2571 lines
72 KiB
C++
2571 lines
72 KiB
C++
/*
|
|
* Copyright (c) 2016, 2017, 2018, 2022 Jonas 'Sortie' Termansen.
|
|
*
|
|
* Permission to use, copy, modify, and distribute this software for any
|
|
* purpose with or without fee is hereby granted, provided that the above
|
|
* copyright notice and this permission notice appear in all copies.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
*
|
|
* net/tcp.cpp
|
|
* Transmission Control Protocol.
|
|
*/
|
|
|
|
// TODO: Implement sending back RST and such.
|
|
// TODO: PUSH.
|
|
// TODO: URG.
|
|
// TODO: Nagle's algorithm, MSG_MORE, TCP_CORK, TCP_NODELAY, etc.
|
|
// TODO: TCP options.
|
|
// TODO: Maximum Segment Size (respect TCP_MSS).
|
|
// TODO: Efficient receieve queue when out of order.
|
|
// TODO: Efficient backlog / half-open. Avoid denial of service attacks.
|
|
// TODO: Measure average round trip time for efficient retransmission?
|
|
// TODO: High speed extensions.
|
|
// TODO: Anti-congestion extensions.
|
|
// TODO: Selective acknowledgements.
|
|
// TODO: Implement all RFC 1122 TCP requirements.
|
|
// TODO: Probing Zero Windows per RFC 1122 4.2.2.17.
|
|
// TODO: os-test all the things.
|
|
|
|
#include <sys/socket.h>
|
|
#include <sys/stat.h>
|
|
|
|
#include <assert.h>
|
|
#include <endian.h>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <limits.h>
|
|
#include <netinet/in.h>
|
|
#include <netinet/tcp.h>
|
|
#include <poll.h>
|
|
#include <signal.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
#include <timespec.h>
|
|
|
|
#ifndef IOV_MAX
|
|
#include <sortix/limits.h>
|
|
#endif
|
|
|
|
#include <sortix/kernel/clock.h>
|
|
#include <sortix/kernel/copy.h>
|
|
#include <sortix/kernel/if.h>
|
|
#include <sortix/kernel/inode.h>
|
|
#include <sortix/kernel/ioctx.h>
|
|
#include <sortix/kernel/kernel.h>
|
|
#include <sortix/kernel/kthread.h>
|
|
#include <sortix/kernel/packet.h>
|
|
#include <sortix/kernel/poll.h>
|
|
#include <sortix/kernel/process.h>
|
|
#include <sortix/kernel/sockopt.h>
|
|
#include <sortix/kernel/thread.h>
|
|
#include <sortix/kernel/time.h>
|
|
#include <sortix/kernel/timer.h>
|
|
#include <sortix/kernel/worker.h>
|
|
|
|
#include "ip.h"
|
|
#include "tcp.h"
|
|
|
|
#define BUFFER_SIZE 65536 // Documented in tcp(4).
|
|
|
|
#define NUM_RETRANSMISSIONS 6 // Documented in tcp(4)
|
|
|
|
namespace Sortix {
|
|
namespace TCP {
|
|
|
|
class TCPSocket;
|
|
|
|
union tcp_sockaddr
|
|
{
|
|
sa_family_t family;
|
|
struct sockaddr_in in;
|
|
struct sockaddr_in6 in6;
|
|
};
|
|
|
|
// The TCP states per STD 7 (RFC 793).
|
|
enum tcp_state
|
|
{
|
|
TCP_STATE_CLOSED,
|
|
TCP_STATE_LISTEN,
|
|
TCP_STATE_SYN_SENT,
|
|
TCP_STATE_SYN_RECV,
|
|
TCP_STATE_ESTAB,
|
|
TCP_STATE_FIN_WAIT_1,
|
|
TCP_STATE_FIN_WAIT_2,
|
|
TCP_STATE_CLOSE_WAIT,
|
|
TCP_STATE_CLOSING,
|
|
TCP_STATE_LAST_ACK,
|
|
TCP_STATE_TIME_WAIT,
|
|
};
|
|
|
|
enum tcp_special
|
|
{
|
|
TCP_SPECIAL_NOT,
|
|
TCP_SPECIAL_PENDING,
|
|
TCP_SPECIAL_WINDOW,
|
|
TCP_SPECIAL_ACKED,
|
|
};
|
|
|
|
// Global lock protecting all TCP sockets as they need to access each other.
|
|
static kthread_mutex_t tcp_lock = KTHREAD_MUTEX_INITIALIZER;
|
|
|
|
static TCPSocket** bindings_v4;
|
|
static TCPSocket** bindings_v6;
|
|
|
|
void Init()
|
|
{
|
|
if ( !(bindings_v4 = new TCPSocket*[65536]) ||
|
|
!(bindings_v6 = new TCPSocket*[65536]) )
|
|
Panic("Failed to allocate TCP Socket bindings");
|
|
for ( size_t i = 0; i < 65536; i++ )
|
|
{
|
|
bindings_v4[i] = NULL;
|
|
bindings_v6[i] = NULL;
|
|
}
|
|
}
|
|
|
|
static inline bool mod32_le(tcp_seq a, tcp_seq b)
|
|
{
|
|
return (int32_t) (a - b) <= 0;
|
|
}
|
|
|
|
static inline bool mod32_lt(tcp_seq a, tcp_seq b)
|
|
{
|
|
return (int32_t) (a - b) < 0;
|
|
}
|
|
|
|
static inline bool mod32_ge(tcp_seq a, tcp_seq b)
|
|
{
|
|
return (int32_t) (a - b) >= 0;
|
|
}
|
|
|
|
static inline bool mod32_gt(tcp_seq a, tcp_seq b)
|
|
{
|
|
return (int32_t) (a - b) > 0;
|
|
}
|
|
|
|
static bool IsSupportedAddressFamily(int af)
|
|
{
|
|
return af == AF_INET /* TODO: || af == AF_INET6 */;
|
|
}
|
|
|
|
static size_t AddressFamilySize(int af)
|
|
{
|
|
switch ( af )
|
|
{
|
|
case AF_INET: return sizeof(struct sockaddr_in);
|
|
case AF_INET6: return sizeof(struct sockaddr_in6);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// The TCP socket implementation. It is separate from the class TCPSocketNode
|
|
// as that class is reference counted, but this class manages its own lifetime
|
|
// so the socket is properly shut down after all references are closed.
|
|
//
|
|
// Bound sockets are in a double linked list starting from the appropriate
|
|
// bindings array indexed by the port, and then the sockets on that port are
|
|
// doubly linked using prev_socket and next_socket.
|
|
//
|
|
// Half-open sockets are in a doubly linked list starting from connecting_half
|
|
// in the listening socket, and then doubly linked with connecting_prev and
|
|
// connecting_next (with connecting_parent going back to the listening socket).
|
|
//
|
|
// Ready sockets that have not yet been accepted are in a doubly linked list
|
|
// starting from connecting_ready in the listening socket, and then doubly
|
|
// linked with connecting_prev and connecting_next (with connecting_parent going
|
|
// back to the listening socket).
|
|
//
|
|
// A socket wants to be deleted when it's in the CLOSED state and is not
|
|
// referenced by its TCPSocketNode anymore. Deletion is possible when the timer
|
|
// is not pending.
|
|
class TCPSocket
|
|
{
|
|
friend void HandleIP(Ref<Packet> pkt,
|
|
const struct in_addr* src,
|
|
const struct in_addr* dst,
|
|
bool dst_broadcast);
|
|
|
|
public:
|
|
TCPSocket(int af);
|
|
~TCPSocket();
|
|
Ref<Inode> accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
|
|
int flags);
|
|
int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
|
|
int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
|
|
int listen(ioctx_t* ctx, int backlog);
|
|
ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
|
|
ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags);
|
|
ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags);
|
|
ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count);
|
|
ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags);
|
|
ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count);
|
|
int poll(ioctx_t* ctx, PollNode* node);
|
|
int getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value,
|
|
size_t* option_size_ptr);
|
|
int setsockopt(ioctx_t* ctx, int level, int option_name,
|
|
const void* option_value, size_t option_size);
|
|
int shutdown(ioctx_t* ctx, int how);
|
|
int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
|
|
int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
|
|
|
|
public:
|
|
void Unreference();
|
|
void ProcessPacket(Ref<Packet> pkt, union tcp_sockaddr* pkt_src,
|
|
union tcp_sockaddr* pkt_dst);
|
|
void ReceivePacket(Ref<Packet> pkt, union tcp_sockaddr* pkt_src,
|
|
union tcp_sockaddr* pkt_dst);
|
|
void OnTimer();
|
|
|
|
inline bool want_destruction()
|
|
{
|
|
return state == TCP_STATE_CLOSED && !is_referenced;
|
|
}
|
|
|
|
inline bool can_destroy()
|
|
{
|
|
return want_destruction() && !timer_armed;
|
|
}
|
|
|
|
private:
|
|
short PollEventStatus();
|
|
bool ImportAddress(ioctx_t* ctx, union tcp_sockaddr* dest, const void* addr,
|
|
size_t addrsize);
|
|
bool CanBind(union tcp_sockaddr new_local);
|
|
bool BindDefault(const union tcp_sockaddr* new_local_ptr);
|
|
void UpdateWindow(uint16_t new_window);
|
|
void TransmitLoop();
|
|
bool Transmit();
|
|
void ScheduleTransmit();
|
|
void SetDeadline();
|
|
void SetTimer();
|
|
void Close();
|
|
void Destroy();
|
|
void Disconnect();
|
|
void Fail(int error);
|
|
ssize_t recv_unlocked(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
|
|
ssize_t send_unlocked(ioctx_t* ctx, const uint8_t* buf, size_t count,
|
|
int flags);
|
|
int shutdown_unlocked(int how);
|
|
|
|
public:
|
|
// The previous socket bound on the same port in the address family.
|
|
TCPSocket* prev_socket;
|
|
|
|
// The next socket bound on the same port in the address family.
|
|
TCPSocket* next_socket;
|
|
|
|
// The first half-connected socket in our listening queue.
|
|
TCPSocket* connecting_half;
|
|
|
|
// The first ready socket in our listening queue.
|
|
TCPSocket* connecting_ready;
|
|
|
|
// The previous half-connected or ready socket in our listening queue.
|
|
TCPSocket* connecting_prev;
|
|
|
|
// The next half-connected or ready socket in our listening queue.
|
|
TCPSocket* connecting_next;
|
|
|
|
// The listening socket this socket is in the listening queue for.
|
|
TCPSocket* connecting_parent;
|
|
|
|
// Condition variable that is signaled when new data can be received.
|
|
kthread_cond_t receive_cond;
|
|
|
|
// Condition variable that is signaled when new data can be transmitted.
|
|
kthread_cond_t transmit_cond;
|
|
|
|
// The local socket name, or the any address port 0 if not set.
|
|
union tcp_sockaddr local;
|
|
|
|
// The remote socket name, or the any address port 0 if not set.
|
|
union tcp_sockaddr remote;
|
|
|
|
// The network interface the socket is bound to, or 0 if none.
|
|
unsigned int ifindex;
|
|
|
|
// Whether the socket has been bound to a port.
|
|
bool bound;
|
|
|
|
// Whether the socket is receiving datagrams.
|
|
bool remoted;
|
|
|
|
// Whether SO_REUSEADDR is set.
|
|
bool reuseaddr;
|
|
|
|
// Whether the socket is referenced from anywhere and must not deallocate.
|
|
bool is_referenced;
|
|
|
|
private:
|
|
// The timer used for retransmissions and timing out the connection.
|
|
Timer timer;
|
|
|
|
// The poll channel to publish poll bit changes on.
|
|
PollChannel poll_channel;
|
|
|
|
// The queue of incoming packets whose sequence numbers are too high to
|
|
// process right now, sorted by increasing sequence number.
|
|
Ref<Packet> receive_queue; // TODO: Not a good way to keep track of this.
|
|
|
|
// The deadline for the remote to acknowledge before retransmitting.
|
|
struct timespec deadline;
|
|
|
|
// The offset at which data begins in the incoming ring buffer.
|
|
size_t incoming_offset;
|
|
|
|
// The amount of bytes in the incoming ring buffer.
|
|
size_t incoming_used;
|
|
|
|
// The offset at which data begins in the outgoing ring buffer.
|
|
size_t outgoing_offset;
|
|
|
|
// The amount of bytes in the outgoing ring buffer.
|
|
size_t outgoing_used;
|
|
|
|
// Send unacknowledged (STD 7, RFC 793).
|
|
tcp_seq send_una;
|
|
|
|
// Send next (STD 7, RFC 793).
|
|
tcp_seq send_nxt;
|
|
|
|
// Send window (STD 7, RFC 793).
|
|
tcp_seq send_wnd;
|
|
|
|
// Send urgent pointer (STD 7, RFC 793).
|
|
tcp_seq send_up;
|
|
|
|
// Segment sequence number used for last window update (STD 7, RFC 793).
|
|
tcp_seq send_wl1;
|
|
|
|
// Segment acknowledgment number used for last window update (STD 7, RFC
|
|
// 793).
|
|
tcp_seq send_wl2;
|
|
|
|
// Next sequence to send (STD 7, RFC 793).
|
|
tcp_seq send_pos;
|
|
|
|
// Initial send sequence number (STD 7, RFC 793).
|
|
tcp_seq iss;
|
|
|
|
// Receive next (STD 7, RFC 793).
|
|
tcp_seq recv_nxt;
|
|
|
|
// Receive window (STD 7, RFC 793).
|
|
tcp_seq recv_wnd;
|
|
|
|
// Receive urgent pointer (STD 7, RFC 793).
|
|
tcp_seq recv_up;
|
|
|
|
// Last sequence acked (STD 7, RFC 793).
|
|
tcp_seq recv_acked;
|
|
|
|
// Last window size advertised (STD 7, RFC 793).
|
|
tcp_seq recv_wndlast;
|
|
|
|
// Initial receive sequence number (STD 7, RFC 793).
|
|
tcp_seq irs;
|
|
|
|
// The address family to which this socket belongs.
|
|
int af;
|
|
|
|
// Set to an errno value if a socket error has occured, or 0 otherwise.
|
|
int sockerr;
|
|
|
|
// The number of sockets in the listening queue.
|
|
int backlog_used;
|
|
|
|
// The maximum number of sockets in the listening queue.
|
|
int backlog_max;
|
|
|
|
// The number of retransmissions that have occured since the last
|
|
// acknowledgement from the remote socket.
|
|
unsigned int retransmissions;
|
|
|
|
// The current TCP state.
|
|
enum tcp_state state;
|
|
|
|
// The state of the outgoing SYN.
|
|
enum tcp_special outgoing_syn;
|
|
|
|
// The state of the outgoing FIN.
|
|
enum tcp_special outgoing_fin;
|
|
|
|
// Whether SYN has been received from the remote socket.
|
|
bool has_syn;
|
|
|
|
// Whether FIN has been received from the remote socket.
|
|
bool has_fin;
|
|
|
|
// Whether a transmission has been scheduled.
|
|
bool transmit_scheduled;
|
|
|
|
// Whether the timer is pending.
|
|
bool timer_armed;
|
|
|
|
// Whether the socket has been shut down for receive.
|
|
bool shutdown_receive;
|
|
|
|
// The incoming ring buffer.
|
|
unsigned char incoming[BUFFER_SIZE];
|
|
|
|
// The outgoing ring buffer.
|
|
unsigned char outgoing[BUFFER_SIZE];
|
|
};
|
|
|
|
// The TCP socket Inode with a reference counted lifetime. The backend class
|
|
// TCPSocket is separate as it may stay alive for a little while after all
|
|
// references to it has been lost.
|
|
class TCPSocketNode : public AbstractInode
|
|
{
|
|
public:
|
|
TCPSocketNode(TCPSocket* socket);
|
|
virtual ~TCPSocketNode();
|
|
virtual Ref<Inode> accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
|
|
int flags);
|
|
virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
|
|
virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize);
|
|
virtual int listen(ioctx_t* ctx, int backlog);
|
|
virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags);
|
|
virtual ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags);
|
|
virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count,
|
|
int flags);
|
|
virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr,
|
|
int flags);
|
|
virtual ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count);
|
|
virtual ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count);
|
|
virtual int poll(ioctx_t* ctx, PollNode* node);
|
|
virtual int getsockopt(ioctx_t* ctx, int level, int option_name,
|
|
void* option_value, size_t* option_size_ptr);
|
|
virtual int setsockopt(ioctx_t* ctx, int level, int option_name,
|
|
const void* option_value, size_t option_size);
|
|
virtual int shutdown(ioctx_t* ctx, int how);
|
|
virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
|
|
virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize);
|
|
|
|
private:
|
|
TCPSocket* socket;
|
|
|
|
};
|
|
|
|
void TCPSocket__OnTimer(Clock* /*clock*/, Timer* /*timer*/, void* user)
|
|
{
|
|
((TCPSocket*) user)->OnTimer();
|
|
}
|
|
|
|
TCPSocket::TCPSocket(int af)
|
|
{
|
|
prev_socket = NULL;
|
|
next_socket = NULL;
|
|
connecting_half = NULL;
|
|
connecting_ready = NULL;
|
|
connecting_prev = NULL;
|
|
connecting_next = NULL;
|
|
connecting_parent = NULL;
|
|
receive_cond = KTHREAD_COND_INITIALIZER;
|
|
transmit_cond = KTHREAD_COND_INITIALIZER;
|
|
memset(&local, 0, sizeof(local));
|
|
memset(&remote, 0, sizeof(remote));
|
|
ifindex = 0;
|
|
bound = false;
|
|
remoted = false;
|
|
reuseaddr = false;
|
|
// timer is initialized by its constructor.
|
|
timer.Attach(Time::GetClock(CLOCK_MONOTONIC));
|
|
// poll_channel is initialized by its constructor.
|
|
// receive_queue is initialized by its constructor.
|
|
deadline = timespec_make(-1, 0);
|
|
incoming_offset = 0;
|
|
incoming_used = 0;
|
|
outgoing_offset = 0;
|
|
outgoing_used = 0;
|
|
send_una = 0;
|
|
send_nxt = 0;
|
|
send_wnd = 0;
|
|
send_up = 0;
|
|
send_wl1 = 0;
|
|
send_wl2 = 0;
|
|
send_pos = 0;
|
|
iss = 0;
|
|
recv_nxt = 0;
|
|
recv_wnd = 0;
|
|
recv_up = 0;
|
|
recv_acked = 0;
|
|
recv_wndlast = 0;
|
|
irs = 0;
|
|
this->af = af;
|
|
sockerr = 0;
|
|
backlog_used = 0;
|
|
backlog_max = 0;
|
|
retransmissions = 0;
|
|
state = TCP_STATE_CLOSED;
|
|
outgoing_syn = TCP_SPECIAL_NOT;
|
|
outgoing_fin = TCP_SPECIAL_NOT;
|
|
has_syn = false;
|
|
has_fin = false;
|
|
transmit_scheduled = false;
|
|
is_referenced = false;
|
|
timer_armed = false;
|
|
shutdown_receive = false;
|
|
memset(incoming, 0, sizeof(incoming));
|
|
memset(outgoing, 0, sizeof(outgoing));
|
|
}
|
|
|
|
TCPSocket::~TCPSocket()
|
|
{
|
|
assert(state == TCP_STATE_CLOSED);
|
|
assert(!bound);
|
|
assert(!prev_socket);
|
|
assert(!next_socket);
|
|
assert(!connecting_half);
|
|
assert(!connecting_half);
|
|
assert(!connecting_ready);
|
|
assert(!connecting_prev);
|
|
assert(!connecting_next);
|
|
assert(!connecting_parent);
|
|
assert(!is_referenced);
|
|
// Avoid stack overflow in receive_queue recursive destructor.
|
|
while ( receive_queue )
|
|
{
|
|
Ref<Packet> packet = receive_queue;
|
|
receive_queue = packet->next;
|
|
packet->next.Reset();
|
|
}
|
|
}
|
|
|
|
void TCPSocket::Unreference()
|
|
{
|
|
kthread_mutex_lock(&tcp_lock);
|
|
is_referenced = false;
|
|
Disconnect();
|
|
bool do_delete = can_destroy();
|
|
kthread_mutex_unlock(&tcp_lock);
|
|
if ( do_delete )
|
|
delete this;
|
|
}
|
|
|
|
void TCPSocket::Close() // tcp_lock taken
|
|
{
|
|
if ( timer_armed && timer.TryCancel() )
|
|
timer_armed = false;
|
|
Destroy();
|
|
state = TCP_STATE_CLOSED;
|
|
kthread_cond_broadcast(&transmit_cond);
|
|
kthread_cond_broadcast(&receive_cond);
|
|
deadline = timespec_make(-1, 0);
|
|
SetTimer();
|
|
}
|
|
|
|
void TCPSocket::Disconnect() // tcp_lock taken
|
|
{
|
|
if ( state == TCP_STATE_LISTEN )
|
|
Close();
|
|
else if ( state != TCP_STATE_CLOSED )
|
|
{
|
|
// TODO: Send back RST if the peer sends when we're not receiving.
|
|
if ( state == TCP_STATE_SYN_SENT ||
|
|
state == TCP_STATE_SYN_RECV ||
|
|
state == TCP_STATE_ESTAB ||
|
|
state == TCP_STATE_CLOSE_WAIT )
|
|
{
|
|
shutdown_unlocked(SHUT_WR);
|
|
// CLOSED, LAST_ACK, FIN_WAIT_1.
|
|
}
|
|
// FIN_WAIT_1 will enter FIN_WAIT_2 or time out.
|
|
// FIN_WAIT_2 will time out when unreferenced.
|
|
// CLOSING will resend FIN or time out.
|
|
// LAST_ACK will resend FIN or time out.
|
|
// TIME_WAIT will time out and close.
|
|
if ( state == TCP_STATE_FIN_WAIT_2 && !is_referenced )
|
|
{
|
|
deadline = timespec_make(-1, 0);
|
|
SetDeadline();
|
|
SetTimer();
|
|
}
|
|
}
|
|
}
|
|
|
|
void TCPSocket::Fail(int error)
|
|
{
|
|
sockerr = error;
|
|
Close();
|
|
}
|
|
|
|
void TCPSocket::Destroy() // tcp_lock taken
|
|
{
|
|
if ( bound )
|
|
{
|
|
if ( af == AF_INET )
|
|
{
|
|
uint16_t port = be16toh(local.in.sin_port);
|
|
if ( prev_socket )
|
|
prev_socket->next_socket = next_socket;
|
|
else
|
|
bindings_v4[port] = next_socket;
|
|
if ( next_socket )
|
|
next_socket->prev_socket = prev_socket;
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
uint16_t port = be16toh(local.in6.sin6_port);
|
|
if ( prev_socket )
|
|
prev_socket->next_socket = next_socket;
|
|
else
|
|
bindings_v6[port] = next_socket;
|
|
if ( next_socket )
|
|
next_socket->prev_socket = prev_socket;
|
|
}
|
|
prev_socket = NULL;
|
|
next_socket = NULL;
|
|
bound = false;
|
|
}
|
|
while ( connecting_half || connecting_ready )
|
|
{
|
|
TCPSocket* socket;
|
|
if ( connecting_half )
|
|
{
|
|
socket = connecting_half;
|
|
connecting_half = socket->connecting_next;
|
|
if ( connecting_half )
|
|
connecting_half->connecting_prev = NULL;
|
|
}
|
|
else
|
|
{
|
|
socket = connecting_ready;
|
|
connecting_ready = socket->connecting_next;
|
|
if ( connecting_ready )
|
|
connecting_ready->connecting_prev = NULL;
|
|
}
|
|
socket->connecting_prev = NULL;
|
|
socket->connecting_next = NULL;
|
|
socket->connecting_parent = NULL;
|
|
backlog_used--;
|
|
socket->Disconnect();
|
|
}
|
|
if ( connecting_parent )
|
|
{
|
|
if ( connecting_prev )
|
|
connecting_prev->connecting_next = connecting_next;
|
|
else if ( state == TCP_STATE_SYN_RECV )
|
|
connecting_parent->connecting_half = connecting_next;
|
|
else
|
|
connecting_parent->connecting_ready = connecting_next;
|
|
if ( connecting_next )
|
|
connecting_next->connecting_prev = connecting_prev;
|
|
connecting_prev = NULL;
|
|
connecting_next = NULL;
|
|
connecting_parent->backlog_used--;
|
|
connecting_parent = NULL;
|
|
}
|
|
}
|
|
|
|
Ref<Inode> TCPSocket::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr,
|
|
int flags)
|
|
{
|
|
if ( flags & ~(0) )
|
|
return errno = EINVAL, Ref<Inode>(NULL);
|
|
if ( addr && !addrsize_ptr )
|
|
return errno = EINVAL, Ref<Inode>(NULL);
|
|
ScopedLock lock(&tcp_lock);
|
|
if ( state != TCP_STATE_LISTEN )
|
|
return errno = EINVAL, Ref<Inode>(NULL);
|
|
while ( !connecting_ready )
|
|
{
|
|
if ( ctx->dflags & O_NONBLOCK )
|
|
return errno = EWOULDBLOCK, Ref<Inode>(NULL);
|
|
if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
|
|
return errno = EINTR, Ref<Inode>(NULL);
|
|
}
|
|
TCPSocket* socket = connecting_ready;
|
|
if ( addr )
|
|
{
|
|
size_t addrsize;
|
|
if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
|
|
return Ref<Inode>(NULL);
|
|
size_t af_addrsize = AddressFamilySize(af);
|
|
if ( af_addrsize < addrsize )
|
|
addrsize = af_addrsize;
|
|
if ( !ctx->copy_to_dest(addr, &socket->remote, addrsize) )
|
|
return Ref<Inode>(NULL);
|
|
if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
|
|
return Ref<Inode>(NULL);
|
|
}
|
|
Ref<TCPSocketNode> result(new TCPSocketNode(socket));
|
|
if ( !result )
|
|
return Ref<Inode>(NULL);
|
|
connecting_ready = socket->connecting_next;
|
|
if ( connecting_ready )
|
|
connecting_ready->connecting_prev = NULL;
|
|
socket->connecting_prev = NULL;
|
|
socket->connecting_next = NULL;
|
|
socket->connecting_parent = NULL;
|
|
backlog_used--;
|
|
return result;
|
|
}
|
|
|
|
bool TCPSocket::ImportAddress(ioctx_t* ctx,
|
|
union tcp_sockaddr* dest,
|
|
const void* addr,
|
|
size_t addrsize)
|
|
{
|
|
// TODO: os-test whether AF_UNSPEC can disconnect.
|
|
if ( addrsize != AddressFamilySize(af) )
|
|
return errno = EINVAL, -1;
|
|
union tcp_sockaddr copy;
|
|
memset(©, 0, sizeof(copy));
|
|
if ( !ctx->copy_from_src(©, addr, addrsize) )
|
|
return false;
|
|
if ( copy.family != af )
|
|
return errno = EAFNOSUPPORT, false;
|
|
memcpy(dest, ©, sizeof(copy));
|
|
return true;
|
|
}
|
|
|
|
bool TCPSocket::CanBind(union tcp_sockaddr new_local) // tcp_lock taken
|
|
{
|
|
if ( af == AF_INET )
|
|
{
|
|
// TODO: os-test binding to broadcast addresses.
|
|
// Bind to either the any address or the address of a network interface.
|
|
if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) )
|
|
{
|
|
// TODO: What happens to sockets if the network interface changes
|
|
// its address?
|
|
ScopedLock ifs_lock(&netifs_lock);
|
|
bool found = false;
|
|
for ( unsigned int i = 1; i < netifs_count; i++ )
|
|
{
|
|
NetworkInterface* netif = netifs[i];
|
|
if ( !netif )
|
|
continue;
|
|
ScopedLock cfg_lock(&netif->cfg_lock);
|
|
if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr,
|
|
sizeof(struct in_addr)) == 0 )
|
|
{
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
// No interface had the correct address.
|
|
if ( !found )
|
|
return errno = EADDRNOTAVAIL, false;
|
|
}
|
|
uint16_t port = be16toh(new_local.in.sin_port);
|
|
for ( TCPSocket* socket = bindings_v4[port];
|
|
socket;
|
|
socket = socket->next_socket )
|
|
{
|
|
// TODO: os-test how SO_REUSEADDR works for TCP.
|
|
if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) &&
|
|
!(reuseaddr && socket->reuseaddr) )
|
|
return errno = EADDRINUSE, false;
|
|
if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) &&
|
|
!(reuseaddr && socket->reuseaddr) )
|
|
return errno = EADDRINUSE, false;
|
|
if ( new_local.in.sin_addr.s_addr ==
|
|
socket->local.in.sin_addr.s_addr )
|
|
return errno = EADDRINUSE, false;
|
|
}
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
// TODO: IPv6 support for seeing if any interface has the address.
|
|
if ( true )
|
|
return errno = EAFNOSUPPORT, false;
|
|
uint16_t port = be16toh(new_local.in6.sin6_port);
|
|
if ( bindings_v6[port] )
|
|
return errno = EADDRINUSE, false;
|
|
for ( TCPSocket* socket = bindings_v6[port];
|
|
socket;
|
|
socket = socket->next_socket )
|
|
{
|
|
if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any,
|
|
sizeof(in6addr_any)) &&
|
|
!(reuseaddr && socket->reuseaddr) )
|
|
if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any,
|
|
sizeof(in6addr_any)) &&
|
|
!(reuseaddr && socket->reuseaddr) )
|
|
if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr,
|
|
sizeof(new_local.in6.sin6_addr)) )
|
|
return errno = EADDRINUSE, false;
|
|
}
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
return true;
|
|
}
|
|
|
|
int TCPSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
|
|
{
|
|
ScopedLock lock(&tcp_lock);
|
|
if ( bound )
|
|
return errno = EINVAL, -1;
|
|
union tcp_sockaddr new_local;
|
|
if ( !ImportAddress(ctx, &new_local, addr, addrsize) )
|
|
return -1;
|
|
uint16_t port;
|
|
if ( af == AF_INET )
|
|
port = be16toh(new_local.in.sin_port);
|
|
else if ( af == AF_INET6 )
|
|
port = be16toh(new_local.in6.sin6_port);
|
|
else
|
|
return errno = EAFNOSUPPORT, -1;
|
|
// TODO: Binding to the any address needs to pick the appropriate source
|
|
// interface and bind to its address. (Or really? udp doesn't?
|
|
// os-test?)
|
|
// TODO: os-test a server listening on any, and then getsockname a
|
|
// connection received on that port.
|
|
if ( port == 0 )
|
|
return BindDefault(&new_local) ? 0 : -1;
|
|
if ( !CanBind(new_local) )
|
|
return -1;
|
|
if ( af == AF_INET )
|
|
{
|
|
uint16_t port = be16toh(new_local.in.sin_port);
|
|
if ( bindings_v4[port] )
|
|
bindings_v4[port]->prev_socket = this;
|
|
next_socket = bindings_v4[port];
|
|
prev_socket = NULL;
|
|
bindings_v4[port] = this;
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
uint16_t port = be16toh(new_local.in6.sin6_port);
|
|
if ( bindings_v6[port] )
|
|
return errno = EADDRINUSE, -1;
|
|
next_socket = bindings_v6[port];
|
|
prev_socket = NULL;
|
|
bindings_v6[port] = this;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, -1;
|
|
memcpy(&local, &new_local, sizeof(new_local));
|
|
bound = true;
|
|
return 0;
|
|
}
|
|
|
|
// tcp_lock locked
|
|
bool TCPSocket::BindDefault(const union tcp_sockaddr* new_local_ptr)
|
|
{
|
|
// TODO: This allocator becomes increasingly biased as more ports are
|
|
// allocated.
|
|
// TODO: Try not to allocate recently used ports.
|
|
union tcp_sockaddr new_local;
|
|
if ( new_local_ptr )
|
|
memcpy(&new_local, new_local_ptr, sizeof(union tcp_sockaddr));
|
|
else
|
|
{
|
|
memset(&new_local, 0, sizeof(new_local));
|
|
if ( af == AF_INET )
|
|
{
|
|
new_local.in.sin_family = AF_INET;
|
|
new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY);
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
new_local.in6.sin6_family = AF_INET6;
|
|
new_local.in6.sin6_addr = in6addr_any;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
}
|
|
uint16_t start = 32768; // Documented in tcp(4).
|
|
uint16_t end = 61000; // Documented in tcp(4).
|
|
uint16_t count = end - start;
|
|
uint16_t offset = arc4random_uniform(count);
|
|
for ( uint16_t i = 0; i < count; i++ )
|
|
{
|
|
uint16_t j = offset + i;
|
|
if ( count <= j )
|
|
j -= count;
|
|
uint16_t port = start + j;
|
|
if ( af == AF_INET )
|
|
new_local.in.sin_port = htobe16(port);
|
|
else if ( af == AF_INET6 )
|
|
new_local.in6.sin6_port = htobe16(port);
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
if ( !CanBind(new_local) )
|
|
{
|
|
if ( errno == EADDRINUSE )
|
|
continue;
|
|
return false;
|
|
}
|
|
if ( af == AF_INET )
|
|
{
|
|
if ( bindings_v4[port] )
|
|
bindings_v4[port]->prev_socket = this;
|
|
next_socket = bindings_v4[port];
|
|
prev_socket = NULL;
|
|
bindings_v4[port] = this;
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
if ( bindings_v6[port] )
|
|
bindings_v6[port]->prev_socket = this;
|
|
next_socket = bindings_v6[port];
|
|
prev_socket = NULL;
|
|
bindings_v6[port] = this;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
memcpy(&local, &new_local, sizeof(new_local));
|
|
bound = true;
|
|
return true;
|
|
}
|
|
return errno = EAGAIN, false;
|
|
}
|
|
|
|
void TCPSocket::TransmitLoop() // tcp_lock taken
|
|
{
|
|
if ( state == TCP_STATE_CLOSED )
|
|
return;
|
|
if ( NUM_RETRANSMISSIONS <= retransmissions )
|
|
{
|
|
Fail(ETIMEDOUT);
|
|
return;
|
|
}
|
|
if ( !Transmit() && NUM_RETRANSMISSIONS - 1 <= retransmissions )
|
|
{
|
|
Fail(errno);
|
|
return;
|
|
}
|
|
}
|
|
|
|
bool TCPSocket::Transmit() // tcp_lock taken
|
|
{
|
|
if ( state == TCP_STATE_CLOSED )
|
|
return (errno = sockerr ? sockerr : ENOTCONN), false;
|
|
|
|
// Move new outgoing data into the transmission window if there is room.
|
|
tcp_seq window_available = (tcp_seq) (send_una + send_wnd - send_nxt);
|
|
if ( window_available && outgoing_syn == TCP_SPECIAL_PENDING )
|
|
{
|
|
send_nxt++;
|
|
outgoing_syn = TCP_SPECIAL_WINDOW;
|
|
window_available--;
|
|
}
|
|
if ( window_available )
|
|
{
|
|
tcp_seq window_data = (tcp_seq)(send_nxt - send_una);
|
|
if ( outgoing_syn == TCP_SPECIAL_WINDOW )
|
|
window_data--;
|
|
if ( outgoing_fin == TCP_SPECIAL_WINDOW )
|
|
window_data--;
|
|
assert(window_data <= outgoing_used);
|
|
size_t outgoing_new = outgoing_used - window_data;
|
|
tcp_seq amount = window_available;
|
|
if ( outgoing_new < amount )
|
|
amount = outgoing_new;
|
|
send_nxt += amount;
|
|
window_available -= amount;
|
|
}
|
|
if ( window_available && outgoing_fin == TCP_SPECIAL_PENDING )
|
|
{
|
|
send_nxt++;
|
|
outgoing_fin = TCP_SPECIAL_WINDOW;
|
|
window_available--;
|
|
}
|
|
|
|
// Transmit packets.
|
|
bool any = false;
|
|
while ( mod32_lt(send_pos, send_nxt) ||
|
|
(has_syn && mod32_lt(recv_acked, recv_nxt)) ||
|
|
recv_wnd != recv_wndlast )
|
|
{
|
|
any = true;
|
|
size_t mtu;
|
|
union tcp_sockaddr sendfrom;
|
|
if ( af == AF_INET )
|
|
{
|
|
if ( !IP::GetSourceIP(&local.in.sin_addr, &remote.in.sin_addr,
|
|
&sendfrom.in.sin_addr, ifindex, &mtu) )
|
|
return false;
|
|
}
|
|
// TODO: IPv6 support.
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
if ( mtu < sizeof(struct tcphdr) )
|
|
return errno = EINVAL, false;
|
|
mtu -= sizeof(struct tcphdr);
|
|
Ref<Packet> pkt = GetPacket();
|
|
if ( !pkt )
|
|
return false;
|
|
pkt->length = sizeof(struct tcphdr);
|
|
unsigned char* out = pkt->from;
|
|
struct tcphdr hdr;
|
|
if ( af == AF_INET )
|
|
{
|
|
hdr.th_sport = local.in.sin_port;
|
|
hdr.th_dport = remote.in.sin_port;
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
hdr.th_sport = local.in6.sin6_port;
|
|
hdr.th_dport = remote.in6.sin6_port;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
hdr.th_seq = htobe32(send_pos);
|
|
hdr.th_offset = TCP_OFFSET_ENCODE(sizeof(struct tcphdr) / 4);
|
|
hdr.th_flags = 0;
|
|
tcp_seq send_nxtpos = send_pos;
|
|
assert(mod32_le(send_nxtpos, send_nxt));
|
|
if ( outgoing_syn == TCP_SPECIAL_WINDOW && send_nxtpos == send_una )
|
|
{
|
|
hdr.th_flags |= TH_SYN;
|
|
send_nxtpos++;
|
|
}
|
|
assert(mod32_le(send_nxtpos, send_nxt));
|
|
if ( has_syn )
|
|
{
|
|
// TODO: RFC 1122 4.2.2.6:
|
|
// "TCP SHOULD send an MSS (Maximum Segment Size) option in
|
|
// every SYN segment when its receive MSS differs from the
|
|
// default 536, and MAY send it always."
|
|
// "If an MSS option is not received at connection setup, TCP
|
|
// MUST assume a default send MSS of 536 (576-40)."
|
|
hdr.th_flags |= TH_ACK;
|
|
hdr.th_ack = htobe32(recv_nxt);
|
|
}
|
|
else
|
|
hdr.th_ack = htobe32(0);
|
|
hdr.th_win = htobe16(recv_wnd);
|
|
hdr.th_urp = htobe16(0);
|
|
hdr.th_sum = htobe16(0);
|
|
tcp_seq window_data = (tcp_seq)(send_nxt - send_pos);
|
|
if ( send_pos == send_una && outgoing_syn == TCP_SPECIAL_WINDOW )
|
|
window_data--;
|
|
if ( mod32_lt(send_pos, send_nxt) &&
|
|
outgoing_fin == TCP_SPECIAL_WINDOW )
|
|
window_data--;
|
|
if ( window_data )
|
|
{
|
|
size_t amount = mtu < window_data ? mtu : window_data;
|
|
assert(outgoing_offset <= sizeof(outgoing));
|
|
tcp_seq window_length = (tcp_seq) (send_nxtpos - send_una);
|
|
if ( outgoing_syn == TCP_SPECIAL_WINDOW )
|
|
window_length--;
|
|
assert(window_length <= sizeof(outgoing));
|
|
size_t outgoing_end = outgoing_offset + window_length;
|
|
if ( sizeof(outgoing) <= outgoing_end )
|
|
outgoing_end -= sizeof(outgoing);
|
|
assert(outgoing_end < sizeof(outgoing));
|
|
size_t until_end = sizeof(outgoing) - outgoing_end;
|
|
size_t first = until_end < amount ? until_end : amount;
|
|
assert(first <= sizeof(outgoing));
|
|
assert(first <= sizeof(outgoing) - outgoing_end);
|
|
size_t second = amount - first;
|
|
assert(second <= sizeof(outgoing));
|
|
memcpy(out + sizeof(hdr), outgoing + outgoing_end, first);
|
|
if ( second )
|
|
memcpy(out + sizeof(hdr) + first, outgoing, second);
|
|
pkt->length += amount;
|
|
send_nxtpos += amount;
|
|
}
|
|
assert(mod32_le(send_nxtpos, send_nxt));
|
|
if ( outgoing_fin == TCP_SPECIAL_WINDOW &&
|
|
send_nxtpos + 1 == send_nxt )
|
|
{
|
|
hdr.th_flags |= TH_FIN;
|
|
send_nxtpos++;
|
|
}
|
|
assert(mod32_le(send_nxtpos, send_nxt));
|
|
memcpy(out, &hdr, sizeof(hdr));
|
|
uint16_t checksum = 0;
|
|
if ( af == AF_INET )
|
|
{
|
|
checksum = IP::ipsum_buf(checksum, &sendfrom.in.sin_addr,
|
|
sizeof(struct in_addr));
|
|
checksum = IP::ipsum_buf(checksum, &remote.in.sin_addr,
|
|
sizeof(struct in_addr));
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
checksum = IP::ipsum_buf(checksum, &sendfrom.in6.sin6_addr,
|
|
sizeof(struct in6_addr));
|
|
checksum = IP::ipsum_buf(checksum, &remote.in6.sin6_addr,
|
|
sizeof(struct in6_addr));
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
checksum = IP::ipsum_word(checksum, IPPROTO_TCP);
|
|
checksum = IP::ipsum_word(checksum, pkt->length);
|
|
checksum = IP::ipsum_buf(checksum, out, pkt->length);
|
|
hdr.th_sum = htobe16(IP::ipsum_finish(checksum));
|
|
memcpy(out, &hdr, sizeof(hdr));
|
|
if ( af == AF_INET )
|
|
{
|
|
if ( !IP::Send(pkt, &sendfrom.in.sin_addr, &remote.in.sin_addr,
|
|
IPPROTO_TCP, ifindex, false) )
|
|
return false;
|
|
}
|
|
// TODO: IPv6 support.
|
|
else
|
|
return errno = EAFNOSUPPORT, false;
|
|
if ( has_syn )
|
|
recv_acked = recv_nxt;
|
|
recv_wndlast = recv_wnd;
|
|
assert(mod32_le(send_nxtpos, send_nxt));
|
|
send_pos = send_nxtpos;
|
|
}
|
|
if ( any )
|
|
{
|
|
SetDeadline();
|
|
SetTimer();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void TCPSocket::OnTimer()
|
|
{
|
|
ScopedLock lock(&tcp_lock);
|
|
timer_armed = false;
|
|
if ( 0 <= deadline.tv_sec &&
|
|
timespec_le(deadline, Time::Get(CLOCK_MONOTONIC)) )
|
|
{
|
|
deadline = timespec_make(-1, 0);
|
|
if ( state == TCP_STATE_TIME_WAIT ||
|
|
(state == TCP_STATE_FIN_WAIT_2 && !is_referenced) )
|
|
Close();
|
|
else if ( mod32_lt(send_una, send_pos) )
|
|
{
|
|
retransmissions++;
|
|
send_pos = send_una;
|
|
}
|
|
}
|
|
transmit_scheduled = false;
|
|
TransmitLoop();
|
|
if ( 0 <= deadline.tv_sec )
|
|
SetTimer();
|
|
if ( can_destroy() )
|
|
delete this;
|
|
}
|
|
|
|
void TCPSocket::ScheduleTransmit() // tcp_lock locked
|
|
{
|
|
if ( state == TCP_STATE_CLOSED || state == TCP_STATE_LISTEN )
|
|
return;
|
|
if ( transmit_scheduled && timer_armed )
|
|
return;
|
|
transmit_scheduled = true;
|
|
SetTimer();
|
|
|
|
}
|
|
|
|
void TCPSocket::SetDeadline() // tcp_lock locked
|
|
{
|
|
if ( 0 <= deadline.tv_sec )
|
|
return;
|
|
if ( state == TCP_STATE_TIME_WAIT ||
|
|
(state == TCP_STATE_FIN_WAIT_2 && !is_referenced) )
|
|
{
|
|
struct timespec now = Time::Get(CLOCK_MONOTONIC);
|
|
struct timespec msl2 = timespec_make(60, 0); // Documented in tcp(4).
|
|
deadline = timespec_add(now, msl2);
|
|
}
|
|
else if ( mod32_le(send_una, send_pos) )
|
|
{
|
|
struct timespec now = Time::Get(CLOCK_MONOTONIC);
|
|
struct timespec delay = timespec_make(1 + 1 * retransmissions, 0);
|
|
deadline = timespec_add(now, delay);
|
|
}
|
|
}
|
|
|
|
void TCPSocket::SetTimer() // tcp_lock locked
|
|
{
|
|
if ( timer_armed )
|
|
{
|
|
if ( !timer.TryCancel() )
|
|
return;
|
|
timer_armed = false;
|
|
}
|
|
if ( state == TCP_STATE_CLOSED )
|
|
return;
|
|
bool destruction_is_wanted = want_destruction();
|
|
if ( transmit_scheduled || destruction_is_wanted || 0 <= deadline.tv_sec )
|
|
{
|
|
int flags = TIMER_FUNC_MAY_DEALLOCATE_TIMER;
|
|
struct itimerspec timeout;
|
|
memset(&timeout, 0, sizeof(timeout));
|
|
// Slightly delay transmission to batch together a better reply.
|
|
if ( transmit_scheduled )
|
|
timeout.it_value = timespec_make(0, 1);
|
|
else if ( 0 <= deadline.tv_sec )
|
|
{
|
|
timeout.it_value = deadline;
|
|
flags |= TIMER_ABSOLUTE;
|
|
}
|
|
timer.Set(&timeout, NULL, flags, TCPSocket__OnTimer, this);
|
|
timer_armed = true;
|
|
}
|
|
}
|
|
|
|
void TCPSocket::ProcessPacket(Ref<Packet> pkt,
|
|
union tcp_sockaddr* pkt_src,
|
|
union tcp_sockaddr* pkt_dst) // tcp_lock locked
|
|
{
|
|
const unsigned char* in = pkt->from + pkt->offset;
|
|
size_t inlen = pkt->length - pkt->offset;
|
|
struct tcphdr hdr;
|
|
memcpy(&hdr, in, sizeof(hdr));
|
|
hdr.th_sport = be16toh(hdr.th_sport);
|
|
hdr.th_dport = be16toh(hdr.th_dport);
|
|
hdr.th_seq = be32toh(hdr.th_seq);
|
|
hdr.th_ack = be32toh(hdr.th_ack);
|
|
hdr.th_win = be16toh(hdr.th_win);
|
|
hdr.th_urp = be16toh(hdr.th_urp);
|
|
size_t offset = TCP_OFFSET_DECODE(hdr.th_offset) * 4;
|
|
in += offset;
|
|
inlen -= offset;
|
|
if ( state == TCP_STATE_CLOSED ) // STD 7, RFC 793, page 65.
|
|
{
|
|
if ( hdr.th_flags & TH_RST )
|
|
return;
|
|
// TODO: ACK the RST.
|
|
return;
|
|
}
|
|
else if ( state == TCP_STATE_LISTEN ) // STD 7, RFC 793, page 65.
|
|
{
|
|
if ( hdr.th_flags & TH_RST )
|
|
return;
|
|
if ( hdr.th_flags & TH_ACK )
|
|
{
|
|
// TODO: Send <SEQ=SEG.ACK><CTL=RST>.
|
|
return;
|
|
}
|
|
if ( !(hdr.th_flags & TH_SYN) )
|
|
return;
|
|
if ( !hdr.th_win )
|
|
return;
|
|
if ( backlog_max <= backlog_used )
|
|
return;
|
|
// TODO: Use SYN cache to mitigate SYN flood attack.
|
|
TCPSocket* socket = new TCPSocket(af);
|
|
if ( !socket )
|
|
return;
|
|
assert(pkt_src);
|
|
assert(pkt_dst);
|
|
socket->remote = *pkt_src;
|
|
socket->local = *pkt_dst;
|
|
socket->remoted = true;
|
|
socket->bound = true;
|
|
if ( af == AF_INET )
|
|
{
|
|
uint16_t port = be16toh(socket->local.in.sin_port);
|
|
socket->prev_socket = NULL;
|
|
socket->next_socket = bindings_v4[port];
|
|
if ( socket->next_socket )
|
|
socket->next_socket->prev_socket = socket;
|
|
bindings_v4[port] = socket;
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
uint16_t port = be16toh(socket->local.in6.sin6_port);
|
|
socket->prev_socket = NULL;
|
|
socket->next_socket = bindings_v6[port];
|
|
if ( socket->next_socket )
|
|
socket->next_socket->prev_socket = socket;
|
|
bindings_v6[port] = socket;
|
|
}
|
|
socket->iss = arc4random();
|
|
socket->send_una = socket->iss;
|
|
socket->send_nxt = socket->iss;
|
|
socket->send_wnd = 1;
|
|
socket->send_pos = socket->iss;
|
|
socket->outgoing_syn = TCP_SPECIAL_PENDING;
|
|
socket->recv_wnd = TCP_MAXWIN;
|
|
socket->recv_acked = hdr.th_seq;
|
|
socket->recv_nxt = hdr.th_seq + 1;
|
|
socket->irs = hdr.th_seq;
|
|
socket->has_syn = true;
|
|
socket->state = TCP_STATE_SYN_RECV;
|
|
socket->UpdateWindow(hdr.th_win);
|
|
socket->connecting_parent = this;
|
|
socket->connecting_prev = NULL;
|
|
socket->connecting_next = connecting_half;
|
|
if ( socket->connecting_next )
|
|
socket->connecting_next->connecting_prev = socket;
|
|
connecting_half = socket;
|
|
backlog_used++;
|
|
socket->TransmitLoop();
|
|
return;
|
|
}
|
|
else if ( state == TCP_STATE_SYN_SENT ) // STD 7, RFC 793, page 66.
|
|
{
|
|
if ( hdr.th_flags & TH_ACK )
|
|
{
|
|
if ( mod32_le(hdr.th_ack, iss) || mod32_gt(hdr.th_ack, send_nxt) )
|
|
{
|
|
if ( hdr.th_flags & TH_RST )
|
|
return;
|
|
// TODO: Send RST.
|
|
return;
|
|
}
|
|
if ( !(mod32_le(send_una, hdr.th_ack) &&
|
|
mod32_le(hdr.th_ack, send_nxt)) )
|
|
return;
|
|
}
|
|
if ( hdr.th_flags & TH_RST )
|
|
{
|
|
Fail(ECONNREFUSED);
|
|
return;
|
|
}
|
|
if ( !(hdr.th_flags & TH_SYN) )
|
|
return;
|
|
recv_acked = hdr.th_seq;
|
|
recv_nxt = hdr.th_seq + 1;
|
|
irs = hdr.th_seq;
|
|
has_syn = true;
|
|
// RFC 1122 4.2.2.20 (c), page 94.
|
|
UpdateWindow(hdr.th_win);
|
|
send_wl1 = hdr.th_seq;
|
|
send_wl2 = hdr.th_ack;
|
|
// TODO: Drop packet if the packet contains data/FIN beyond the SYN?
|
|
if ( hdr.th_flags & TH_ACK )
|
|
{
|
|
send_una = hdr.th_ack;
|
|
retransmissions = 0;
|
|
deadline = timespec_make(-1, 0);
|
|
SetDeadline();
|
|
SetTimer();
|
|
if ( mod32_lt(iss, send_una) )
|
|
{
|
|
outgoing_syn = TCP_SPECIAL_ACKED;
|
|
state = TCP_STATE_ESTAB;
|
|
kthread_cond_broadcast(&receive_cond); // Wake up connect.
|
|
return;
|
|
}
|
|
}
|
|
state = TCP_STATE_SYN_RECV;
|
|
return;
|
|
}
|
|
// STD 7, RFC 793, page 69.
|
|
bool acceptable;
|
|
if ( inlen == 0 && recv_wnd == 0 )
|
|
acceptable = hdr.th_seq == recv_nxt;
|
|
else if ( inlen == 0 && 0 < recv_wnd )
|
|
acceptable = mod32_le(recv_nxt, hdr.th_seq) &&
|
|
mod32_lt(hdr.th_seq, recv_nxt + recv_wnd);
|
|
else if ( 0 < inlen && 0 < recv_wnd )
|
|
{
|
|
tcp_seq seg_end = (tcp_seq) (hdr.th_seq + inlen - 1);
|
|
acceptable = (mod32_le(recv_nxt, hdr.th_seq) &&
|
|
mod32_lt(hdr.th_seq, recv_nxt + recv_wnd)) ||
|
|
(mod32_le(recv_nxt, seg_end) &&
|
|
mod32_lt(seg_end, recv_nxt + recv_wnd));
|
|
}
|
|
else
|
|
{
|
|
acceptable = false;
|
|
// TODO: STD 7, RFC 793, page 69 "If the RCV.WND is zero, no segments
|
|
// will be acceptable, but special allowance should be made to
|
|
// accept valid ACKs, URGs and RSTs".
|
|
}
|
|
if ( !acceptable )
|
|
{
|
|
if ( hdr.th_flags & TH_RST )
|
|
return;
|
|
// Send <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>.
|
|
recv_acked = recv_nxt - 1;
|
|
return;
|
|
}
|
|
// STD 7, RFC 793, page 70. Process segments in the right order and trim the
|
|
// segment to the receive window.
|
|
uint16_t real_seq = hdr.th_seq;
|
|
if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_SYN) )
|
|
{
|
|
hdr.th_flags &= ~TH_SYN;
|
|
hdr.th_seq++;
|
|
}
|
|
if ( mod32_lt(hdr.th_seq, recv_nxt) )
|
|
{
|
|
tcp_seq skip = recv_nxt - hdr.th_seq;
|
|
if ( inlen < skip )
|
|
skip = inlen;
|
|
hdr.th_seq += skip;
|
|
in += skip;
|
|
inlen -= skip;
|
|
}
|
|
if ( mod32_lt(hdr.th_seq, recv_nxt) && (hdr.th_flags & TH_FIN) )
|
|
{
|
|
hdr.th_flags &= ~TH_FIN;
|
|
hdr.th_seq++;
|
|
}
|
|
if ( mod32_lt(hdr.th_seq, recv_nxt) ) // Already processes.
|
|
return;
|
|
if ( mod32_gt(hdr.th_seq, recv_nxt) ) // Can't process yet.
|
|
{
|
|
// Insert the segment in the receive queue.
|
|
Ref<Packet> prev;
|
|
Ref<Packet> iter = receive_queue;
|
|
// TODO: For n packets in the worst order, this scales O(n^2).
|
|
// TODO: This wastes a packet per byte in the worst case.
|
|
while ( iter )
|
|
{
|
|
const unsigned char* iter_in = iter->from + iter->offset;
|
|
const unsigned char* iter_in_seq =
|
|
iter_in + offsetof(struct tcphdr, th_seq);
|
|
tcp_seq iter_seq;
|
|
memcpy(&iter_seq, iter_in_seq, sizeof(iter_seq));
|
|
iter_seq = be32toh(iter_seq);
|
|
if ( mod32_le(real_seq, iter_seq) )
|
|
break;
|
|
// TODO: Handle duplicate and overlapping segments.
|
|
prev = iter;
|
|
iter = iter->next;
|
|
}
|
|
if ( prev )
|
|
{
|
|
pkt->next = prev->next;
|
|
prev->next = pkt;
|
|
}
|
|
else
|
|
{
|
|
pkt->next = receive_queue;
|
|
receive_queue = pkt;
|
|
}
|
|
return;
|
|
}
|
|
if ( recv_wnd < inlen )
|
|
inlen = recv_wnd;
|
|
// STD 7, RFC 793, page 70.
|
|
if ( hdr.th_flags & TH_RST )
|
|
{
|
|
if ( state == TCP_STATE_CLOSING ||
|
|
state == TCP_STATE_LAST_ACK ||
|
|
state == TCP_STATE_TIME_WAIT )
|
|
Close();
|
|
else
|
|
Fail(ECONNRESET);
|
|
return;
|
|
}
|
|
// STD 7, RFC 793, page 71.
|
|
if ( hdr.th_flags & TH_SYN )
|
|
{
|
|
// TODO: Send RST.
|
|
Fail(ECONNRESET);
|
|
return;
|
|
}
|
|
// STD 7, RFC 793, page 72.
|
|
if ( !(hdr.th_flags & TH_ACK) )
|
|
return;
|
|
// STD 7, RFC 793, page 72.
|
|
if ( state == TCP_STATE_SYN_RECV )
|
|
{
|
|
// RFC 1122 4.2.2.20 (f), page 94.
|
|
UpdateWindow(hdr.th_win);
|
|
send_wl1 = hdr.th_seq;
|
|
send_wl2 = hdr.th_ack;
|
|
if ( mod32_le(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt) )
|
|
{
|
|
state = TCP_STATE_ESTAB;
|
|
kthread_cond_broadcast(&receive_cond); // Wake up connect.
|
|
if ( connecting_parent )
|
|
{
|
|
if ( connecting_prev )
|
|
connecting_prev->connecting_next = connecting_next;
|
|
else
|
|
connecting_parent->connecting_half = connecting_next;
|
|
if ( connecting_next )
|
|
connecting_next->connecting_prev = connecting_prev;
|
|
// TODO: This inserts the connection to the front of the
|
|
// accept queue, rather than the end, which is unfair to
|
|
// connections that have been waiting longer.
|
|
connecting_prev = NULL;
|
|
connecting_next = connecting_parent->connecting_ready;
|
|
if ( connecting_next )
|
|
connecting_next->connecting_prev = this;
|
|
connecting_parent->connecting_ready = this;
|
|
kthread_cond_broadcast(&connecting_parent->receive_cond);
|
|
uint16_t status = connecting_parent->PollEventStatus();
|
|
connecting_parent->poll_channel.Signal(status);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// TODO: Send <SEQ=SEG.ACK><CTL=RST>.
|
|
TransmitLoop();
|
|
return;
|
|
}
|
|
}
|
|
// STD 7, RFC 793, page 72.
|
|
// TODO: RFC 1122 4.2.2.20 (g), page 94 says SEG.ACK =< SND.UNA however this
|
|
// causes incoming connections to fail.
|
|
if ( mod32_lt(hdr.th_ack, send_una) )
|
|
return; // Drop duplicate ack already seen.
|
|
else if ( mod32_lt(send_nxt, hdr.th_ack) )
|
|
{
|
|
// TODO: Send ACK.
|
|
return;
|
|
}
|
|
// STD 7, RFC 793, page 72. Remove acknowledged data from the window.
|
|
tcp_seq old_send_una = send_una;
|
|
tcp_seq acked = (tcp_seq) (hdr.th_ack - send_una);
|
|
if ( outgoing_syn == TCP_SPECIAL_WINDOW && 0 < acked )
|
|
{
|
|
outgoing_syn = TCP_SPECIAL_ACKED;
|
|
acked--;
|
|
send_una++;
|
|
}
|
|
tcp_seq window_data = (tcp_seq) (send_nxt - send_una);
|
|
if ( outgoing_fin == TCP_SPECIAL_WINDOW )
|
|
window_data--;
|
|
if ( window_data && acked )
|
|
{
|
|
size_t amount = window_data < acked ? window_data : acked;
|
|
assert(outgoing_offset < sizeof(outgoing));
|
|
outgoing_offset += amount;
|
|
if ( sizeof(outgoing) <= outgoing_offset )
|
|
outgoing_offset -= sizeof(outgoing);
|
|
assert(outgoing_offset < sizeof(outgoing));
|
|
assert(amount <= outgoing_used);
|
|
outgoing_used -= amount;
|
|
kthread_cond_broadcast(&transmit_cond);
|
|
poll_channel.Signal(PollEventStatus());
|
|
acked -= amount;
|
|
send_una += amount;
|
|
}
|
|
bool fin_was_acked = false;
|
|
if ( outgoing_fin == TCP_SPECIAL_WINDOW && 0 < acked )
|
|
{
|
|
outgoing_fin = TCP_SPECIAL_ACKED;
|
|
acked--;
|
|
send_una++;
|
|
fin_was_acked = true;
|
|
}
|
|
if ( send_una != old_send_una )
|
|
{
|
|
// TODO: Possibly recalculate the average time to contact remote.
|
|
retransmissions = 0;
|
|
SetTimer();
|
|
}
|
|
// STD 7, RFC 793, page 72.
|
|
if ( mod32_lt(send_wl1, hdr.th_seq) ||
|
|
(send_wl1 == hdr.th_seq && mod32_le(send_wl2, hdr.th_ack)) )
|
|
{
|
|
UpdateWindow(hdr.th_win);
|
|
send_wl1 = hdr.th_seq;
|
|
send_wl2 = hdr.th_ack;
|
|
}
|
|
// STD 7, RFC 793, page 73.
|
|
if ( state == TCP_STATE_FIN_WAIT_1 )
|
|
{
|
|
if ( fin_was_acked )
|
|
{
|
|
state = TCP_STATE_FIN_WAIT_2;
|
|
// Time out the connection if the socket is no longer referenced.
|
|
if ( !is_referenced )
|
|
{
|
|
deadline = timespec_make(-1, 0);
|
|
SetDeadline();
|
|
SetTimer();
|
|
}
|
|
}
|
|
}
|
|
else if ( state == TCP_STATE_CLOSING )
|
|
{
|
|
if ( fin_was_acked )
|
|
{
|
|
state = TCP_STATE_TIME_WAIT;
|
|
deadline = timespec_make(-1, 0);
|
|
SetDeadline();
|
|
SetTimer();
|
|
}
|
|
return;
|
|
}
|
|
else if ( state == TCP_STATE_LAST_ACK )
|
|
{
|
|
if ( fin_was_acked )
|
|
Close();
|
|
return;
|
|
}
|
|
// TODO: Urgent data per STD 7, RFC 793, page 73.
|
|
// STD 7, RFC 793, page 74.
|
|
if ( state == TCP_STATE_ESTAB ||
|
|
state == TCP_STATE_FIN_WAIT_1 ||
|
|
state == TCP_STATE_FIN_WAIT_2 )
|
|
{
|
|
assert(incoming_offset < sizeof(incoming));
|
|
assert(incoming_used <= sizeof(incoming));
|
|
size_t available = sizeof(incoming) - incoming_used;
|
|
size_t amount = available < inlen ? available : inlen;
|
|
assert(amount <= sizeof(incoming));
|
|
assert(amount <= available);
|
|
size_t newat = incoming_offset + incoming_used;
|
|
if ( sizeof(incoming) <= newat )
|
|
newat -= sizeof(incoming);
|
|
assert(newat < sizeof(incoming));
|
|
size_t until_end = sizeof(incoming) - newat;
|
|
assert(until_end <= sizeof(incoming));
|
|
size_t first = until_end < amount ? until_end : amount;
|
|
assert(first <= amount);
|
|
assert(first <= sizeof(incoming));
|
|
size_t second = amount - first;
|
|
assert(second <= amount);
|
|
assert(second <= sizeof(incoming));
|
|
assert(first + second == amount);
|
|
assert(first + second <= sizeof(incoming));
|
|
assert(first + second <= available);
|
|
if ( !shutdown_receive )
|
|
{
|
|
memcpy(incoming + newat, in, first);
|
|
if ( second )
|
|
memcpy(incoming, in + first, second);
|
|
incoming_used += amount;
|
|
}
|
|
available = sizeof(incoming) - incoming_used;
|
|
if ( available < recv_wnd )
|
|
recv_wnd = available;
|
|
recv_nxt = hdr.th_seq + amount;
|
|
if ( amount == inlen && (hdr.th_flags & TH_FIN) )
|
|
{
|
|
recv_nxt++;
|
|
has_fin = true;
|
|
}
|
|
if ( incoming_used || has_fin )
|
|
{
|
|
kthread_cond_broadcast(&receive_cond);
|
|
poll_channel.Signal(PollEventStatus());
|
|
}
|
|
}
|
|
// STD 7, RFC 793, page 75.
|
|
if ( hdr.th_flags & TH_FIN )
|
|
{
|
|
if ( state == TCP_STATE_ESTAB )
|
|
{
|
|
state = TCP_STATE_CLOSE_WAIT;
|
|
kthread_cond_broadcast(&receive_cond);
|
|
poll_channel.Signal(PollEventStatus());
|
|
}
|
|
else if ( state == TCP_STATE_FIN_WAIT_1 )
|
|
{
|
|
// Our sent FIN hasn't been ACK'd or we'd be in FIN_WAIT_2.
|
|
state = TCP_STATE_CLOSING;
|
|
}
|
|
else if ( state == TCP_STATE_FIN_WAIT_2 )
|
|
{
|
|
state = TCP_STATE_TIME_WAIT;
|
|
deadline = timespec_make(-1, 0);
|
|
SetDeadline();
|
|
SetTimer();
|
|
}
|
|
else if ( state == TCP_STATE_TIME_WAIT )
|
|
{
|
|
// The timer is not reset like as by the standard to avoid a hostile
|
|
// remote from staying forever in TIME-WAIT.
|
|
}
|
|
}
|
|
}
|
|
|
|
void TCPSocket::ReceivePacket(Ref<Packet> pktnew,
|
|
union tcp_sockaddr* pkt_src,
|
|
union tcp_sockaddr* pkt_dst) // tcp_lock locked
|
|
{
|
|
if ( pktnew )
|
|
ProcessPacket(pktnew, pkt_src, pkt_dst);
|
|
while ( receive_queue )
|
|
{
|
|
Ref<Packet> pkt = receive_queue;
|
|
const unsigned char* in = pkt->from + pkt->offset;
|
|
const unsigned char* in_seq = in + offsetof(struct tcphdr, th_seq);
|
|
tcp_seq seq;
|
|
memcpy(&seq, in_seq, sizeof(seq));
|
|
seq = be32toh(seq);
|
|
if ( mod32_gt(seq, recv_nxt) )
|
|
break;
|
|
receive_queue = pkt->next;
|
|
pkt->next.Reset();
|
|
if ( seq == recv_nxt )
|
|
ProcessPacket(pkt, pkt_src, pkt_dst);
|
|
}
|
|
// Delay transmit to answer more efficiently based on upcoming packets.
|
|
ScheduleTransmit();
|
|
}
|
|
|
|
void TCPSocket::UpdateWindow(uint16_t new_window)
|
|
{
|
|
tcp_seq pending = (tcp_seq) (send_nxt - send_una);
|
|
if ( new_window < pending )
|
|
send_nxt = (tcp_seq) (send_una + pending);
|
|
send_wnd = new_window;
|
|
}
|
|
|
|
int TCPSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
|
|
{
|
|
ScopedLock lock(&tcp_lock);
|
|
// TODO: os-test listen + connect, what errno?
|
|
if ( state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV )
|
|
return errno = EALREADY, -1;
|
|
if ( state != TCP_STATE_CLOSED )
|
|
return errno = EISCONN, -1; // TODO: Another errno if listening?
|
|
union tcp_sockaddr new_remote;
|
|
if ( !ImportAddress(ctx, &new_remote, addr, addrsize) )
|
|
return -1;
|
|
if ( af == AF_INET )
|
|
{
|
|
// Verify the port is non-zero.
|
|
if ( be16toh(new_remote.in.sin_port) == 0 )
|
|
return errno = EADDRNOTAVAIL, -1;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, -1;
|
|
// TODO: os-test AF_UNSPEC
|
|
// If the socket is not bound, find a route to the remote address and bind
|
|
// to the appropriate source address.
|
|
if ( !bound )
|
|
{
|
|
union tcp_sockaddr new_local;
|
|
memset(&new_local, 0, sizeof(new_local));
|
|
if ( af == AF_INET )
|
|
{
|
|
struct in_addr any;
|
|
any.s_addr = htobe32(INADDR_ANY);
|
|
new_local.in.sin_family = AF_INET;
|
|
if ( !IP::GetSourceIP(&any, &new_remote.in.sin_addr,
|
|
&new_local.in.sin_addr, ifindex, NULL) )
|
|
return -1;
|
|
new_local.in.sin_port = htobe16(0);
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, -1;
|
|
if ( !BindDefault(&new_local) )
|
|
return -1;
|
|
}
|
|
// Test if there is a route from the local address to the remote address.
|
|
// TODO: Does TCP also do this? Note that connecting to the any address
|
|
// should be forbidden, right?
|
|
if ( af == AF_INET )
|
|
{
|
|
if ( !IP::GetSourceIP(&local.in.sin_addr, &new_remote.in.sin_addr,
|
|
NULL, ifindex, NULL) )
|
|
return -1;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, -1;
|
|
memcpy(&remote, &new_remote, sizeof(new_remote));
|
|
remoted = true;
|
|
iss = arc4random();
|
|
recv_wnd = TCP_MAXWIN;
|
|
send_una = iss;
|
|
send_nxt = iss;
|
|
send_wnd = 1;
|
|
send_pos = iss;
|
|
outgoing_syn = TCP_SPECIAL_PENDING;
|
|
state = TCP_STATE_SYN_SENT;
|
|
TransmitLoop();
|
|
while ( !sockerr &&
|
|
(state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV) )
|
|
{
|
|
// TODO: os-test non-blocking connect.
|
|
if ( ctx->dflags & O_NONBLOCK )
|
|
return errno = EINPROGRESS, -1;
|
|
if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
|
|
return errno = EINTR, -1;
|
|
}
|
|
if ( sockerr )
|
|
{
|
|
// TODO: This is not recoverable. Is that correct?
|
|
// TODO: os-test whether reconnect is possible after failed connect?
|
|
return errno = sockerr, -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int TCPSocket::listen(ioctx_t* /*ctx*/, int backlog)
|
|
{
|
|
if ( backlog < 0 )
|
|
return errno = EINVAL, -1;
|
|
// TODO: os-test if zero backlog allows connections.
|
|
if ( backlog == 0 )
|
|
backlog = 1;
|
|
else if ( backlog < 0 || SOMAXCONN < backlog )
|
|
backlog = SOMAXCONN;
|
|
ScopedLock lock(&tcp_lock);
|
|
if ( !bound )
|
|
return errno = EDESTADDRREQ, -1;
|
|
// TODO: os-test a regular connection, close, and then try to listen.
|
|
// TODO: os-test listen twice.
|
|
if ( state != TCP_STATE_CLOSED && state != TCP_STATE_LISTEN )
|
|
return errno = EINVAL, -1;
|
|
backlog_max = backlog;
|
|
memset(&remote, 0, sizeof(remote));
|
|
if ( af == AF_INET )
|
|
{
|
|
remote.in.sin_family = AF_INET;
|
|
remote.in.sin_addr.s_addr = htobe32(INADDR_ANY);
|
|
}
|
|
else if ( af == AF_INET6 )
|
|
{
|
|
remote.in6.sin6_family = AF_INET6;
|
|
remote.in6.sin6_addr = in6addr_any;
|
|
}
|
|
else
|
|
return errno = EAFNOSUPPORT, -1;
|
|
remoted = true;
|
|
state = TCP_STATE_LISTEN;
|
|
return 0;
|
|
}
|
|
|
|
ssize_t TCPSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags)
|
|
{
|
|
ScopedLock lock(&tcp_lock);
|
|
ssize_t result = recv_unlocked(ctx, buf, count, flags);
|
|
// Respond immediately if receive window has become empty.
|
|
!incoming_used ? TransmitLoop() : ScheduleTransmit();
|
|
return result;
|
|
}
|
|
|
|
ssize_t TCPSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags)
|
|
{
|
|
struct msghdr msg;
|
|
if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) )
|
|
return -1;
|
|
if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen )
|
|
return errno = EINVAL, -1;
|
|
size_t iov_size = msg.msg_iovlen * sizeof(struct iovec);
|
|
struct iovec* iov = new struct iovec[msg.msg_iovlen];
|
|
if ( !iov )
|
|
return -1;
|
|
struct iovec* user_iov = msg.msg_iov;
|
|
if ( !ctx->copy_from_src(iov, user_iov, iov_size) )
|
|
return delete[] iov, -1;
|
|
msg.msg_iov = iov;
|
|
kthread_mutex_lock(&tcp_lock);
|
|
ssize_t result = 0;
|
|
for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ )
|
|
{
|
|
size_t maximum = SSIZE_MAX - (size_t) result;
|
|
uint8_t* buf = (uint8_t*) iov[i].iov_base;
|
|
size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum;
|
|
if ( !count )
|
|
continue;
|
|
ssize_t amount = recv_unlocked(ctx, buf, count, flags);
|
|
if ( amount < 0 )
|
|
{
|
|
if ( result == 0 )
|
|
result = -1;
|
|
break;
|
|
}
|
|
result += amount;
|
|
if ( (size_t) amount != count )
|
|
break;
|
|
}
|
|
// Respond immediately if receive window has become empty.
|
|
!incoming_used ? TransmitLoop() : ScheduleTransmit();
|
|
kthread_mutex_unlock(&tcp_lock);
|
|
msg.msg_iov = user_iov;
|
|
// TODO: os-test POSIX's requirement to ignore the namemsg_name
|
|
// msg_namelen, plus msg_controllen's behavior is unspecified.
|
|
msg.msg_namelen = 0;
|
|
msg.msg_controllen = 0;
|
|
delete[] iov;
|
|
if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) )
|
|
return -1;
|
|
return result;
|
|
}
|
|
|
|
ssize_t TCPSocket::recv_unlocked(ioctx_t* ctx,
                                 uint8_t* buf,
                                 size_t count,
                                 int flags) // tcp_lock taken
{
	if ( flags & ~(MSG_PEEK | MSG_WAITALL) ) // TODO: MSG_OOB.
		return errno = EINVAL, -1;
	if ( sockerr )
		return errno = sockerr, -1;
	// TODO: os-test non-blocking connect + immediate recv.
	// TODO: CLOSED after it has been closed?
	if ( state == TCP_STATE_CLOSED ||
	     state == TCP_STATE_LISTEN ||
	     state == TCP_STATE_SYN_SENT ||
	     state == TCP_STATE_SYN_RECV )
		return errno = ENOTCONN, -1;
	size_t sofar = 0;
	while ( sofar < count )
	{
		while ( !(incoming_used || has_fin || shutdown_receive) )
		{
			if ( sockerr )
				return sofar ? sofar : (errno = sockerr, -1);
			if ( state == TCP_STATE_CLOSED )
				return sofar;
			if ( sofar && !(flags & MSG_WAITALL) )
				return sofar;
			if ( ctx->dflags & O_NONBLOCK )
				return sofar ? sofar : (errno = EWOULDBLOCK, -1);
			if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) )
				return sofar ? sofar : (errno = EINTR, -1);
			if ( sockerr )
				return sofar ? sofar : (errno = sockerr, -1);
		}
		if ( incoming_used == 0 && (has_fin || shutdown_receive) )
			return sofar;
		uint8_t* data = buf + sofar;
		size_t left = count - sofar;
		assert(incoming_used <= sizeof(incoming));
		size_t amount = incoming_used < left ? incoming_used : left;
		assert(incoming_offset < sizeof(incoming));
		size_t until_end = sizeof(incoming) - incoming_offset;
		size_t first = until_end < amount ? until_end : amount;
		size_t second = amount - first;
		if ( !ctx->copy_to_dest(data, incoming + incoming_offset, first) )
			return sofar ? sofar : -1;
		if ( second && !ctx->copy_to_dest(data + first, incoming, second) )
			return sofar ? sofar : -1;
		sofar += amount;
		if ( flags & MSG_PEEK )
			return sofar;
		incoming_offset += amount;
		if ( sizeof(incoming) <= incoming_offset )
			incoming_offset -= sizeof(incoming);
		assert(incoming_offset < sizeof(incoming));
		incoming_used -= amount;
		recv_wnd = sizeof(incoming) - incoming_used;
		if ( UINT16_MAX < recv_wnd )
			recv_wnd = UINT16_MAX;
		if ( TCP_MAXWIN < recv_wnd )
			recv_wnd = TCP_MAXWIN;
	}
	return sofar;
}

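// send() appends data to the outgoing buffer under the TCP lock and then runs
// TransmitLoop() so that newly queued data is considered for transmission
// right away.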
ssize_t TCPSocket::send(ioctx_t* ctx,
                        const uint8_t* buf,
                        size_t count,
                        int flags)
{
	ScopedLock lock(&tcp_lock);
	ssize_t result = send_unlocked(ctx, buf, count, flags);
	TransmitLoop();
	return result;
}

ssize_t TCPSocket::sendmsg(ioctx_t* ctx,
                           const struct msghdr* msg_ptr,
                           int flags)
{
	struct msghdr msg;
	if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) )
		return -1;
	if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen )
		return errno = EINVAL, -1;
	// TODO: os-test if msg_name/msg_namelen/msg_control/msg_controllen are set.
	size_t iov_size = msg.msg_iovlen * sizeof(struct iovec);
	struct iovec* iov = new struct iovec[msg.msg_iovlen];
	if ( !iov )
		return -1;
	if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) )
		return delete[] iov, -1;
	msg.msg_iov = iov;
	kthread_mutex_lock(&tcp_lock);
	ssize_t result = 0;
	for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ )
	{
		size_t maximum = SSIZE_MAX - (size_t) result;
		const uint8_t* buf = (const uint8_t*) iov[i].iov_base;
		size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum;
		ssize_t amount = send_unlocked(ctx, buf, count, flags);
		if ( amount < 0 )
		{
			if ( result == 0 )
				result = -1;
			break;
		}
		result += amount;
		if ( (size_t) amount != count )
			break;
	}
	TransmitLoop();
	kthread_mutex_unlock(&tcp_lock);
	delete[] iov;
	return result;
}

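// send_unlocked() appends to the circular outgoing buffer. It blocks (unless
// O_NONBLOCK is set) while the buffer is full or the connection is not yet
// usable, raises SIGPIPE/EPIPE once sending is no longer possible, and
// otherwise copies the data in up to two pieces when the write wraps around
// the end of the array.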
ssize_t TCPSocket::send_unlocked(ioctx_t* ctx,
                                 const uint8_t* buf,
                                 size_t count,
                                 int flags) // tcp_lock taken
{
	// TODO: MSG_MORE (and implement TCP_CORK), MSG_OOB, MSG_DONTROUTE.
	if ( flags & ~(MSG_NOSIGNAL) )
		return errno = EINVAL, -1;
	if ( sockerr )
		return errno = sockerr, -1;
	if ( state == TCP_STATE_CLOSED ||
	     state == TCP_STATE_LISTEN ||
	     state == TCP_STATE_SYN_SENT ||
	     state == TCP_STATE_SYN_RECV )
		return errno = ENOTCONN, -1;
	size_t sofar = 0;
	while ( sofar < count )
	{
		while ( outgoing_used == sizeof(outgoing) ||
		        (state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT) )
		{
			if ( sofar )
				return sofar;
			if ( sockerr )
				return errno = sockerr, -1;
			if ( ctx->dflags & O_NONBLOCK )
				return errno = EWOULDBLOCK, -1;
			if ( !kthread_cond_wait_signal(&transmit_cond, &tcp_lock) )
				return errno = EINTR, -1;
		}
		if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT )
		{
			if ( !(flags & MSG_NOSIGNAL) )
				CurrentThread()->DeliverSignal(SIGPIPE);
			return errno = EPIPE, -1;
		}
		const uint8_t* data = buf + sofar;
		size_t left = count - sofar;
		assert(outgoing_offset < sizeof(outgoing));
		assert(outgoing_used <= sizeof(outgoing));
		size_t available = sizeof(outgoing) - outgoing_used;
		size_t amount = available < left ? available : left;
		size_t newat = outgoing_offset + outgoing_used;
		if ( sizeof(outgoing) <= newat )
			newat -= sizeof(outgoing);
		assert(newat < sizeof(outgoing));
		size_t until_end = sizeof(outgoing) - newat;
		size_t first = until_end < amount ? until_end : amount;
		size_t second = amount - first;
		if ( !ctx->copy_from_src(outgoing + newat, data, first) )
			return sofar ? sofar : -1;
		if ( second && !ctx->copy_from_src(outgoing, data + first, second) )
			return sofar ? sofar : -1;
		outgoing_used += amount;
		assert(outgoing_used <= sizeof(outgoing));
		sofar += amount;
		// TODO: If there's a sent packet that hasn't been acknowledged, and
		//       there isn't a full packet yet, then just buffer and don't
		//       transmit yet.
		// TODO: TCP_NODELAY, TCP_NOPUSH, MSG_MORE.
		// TODO: Set PUSH appropriately.
	}
	return sofar;
}

ssize_t TCPSocket::read(ioctx_t* ctx, uint8_t* buf, size_t count)
{
	return recv(ctx, buf, count, 0);
}

ssize_t TCPSocket::write(ioctx_t* ctx, const uint8_t* buf, size_t count)
{
	return send(ctx, buf, count, 0);
}

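// PollEventStatus() computes the current poll(2) readiness bits: readable
// when an incoming connection appears ready to be accepted (connecting_ready)
// or data/FIN/shutdown is pending, writable when established (or in
// CLOSE_WAIT) with room in the outgoing buffer, POLLHUP once the peer has
// finished sending or the connection is done, and POLLERR when a socket error
// is pending.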
short TCPSocket::PollEventStatus()
{
	// TODO: os-test the poll bits.
	// TODO: OOB poll bits.
	short status = 0;
	if ( connecting_ready )
		status |= POLLIN | POLLRDNORM;
	if ( incoming_used || has_fin || shutdown_receive )
		status |= POLLIN | POLLRDNORM;
	if ( (state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT) &&
	     outgoing_used < sizeof(outgoing) )
		status |= POLLOUT | POLLWRNORM;
	if ( state == TCP_STATE_CLOSE_WAIT ||
	     state == TCP_STATE_LAST_ACK ||
	     state == TCP_STATE_TIME_WAIT ||
	     state == TCP_STATE_CLOSED )
		status |= POLLHUP;
	if ( sockerr )
		status |= POLLERR;
	return status;
}

int TCPSocket::poll(ioctx_t* /*ctx*/, PollNode* node)
{
	ScopedLock lock(&tcp_lock);
	short ret_status = PollEventStatus() & node->events;
	if ( ret_status )
	{
		node->master->revents |= ret_status;
		return 0;
	}
	poll_channel.Register(node);
	return errno = EAGAIN, -1;
}

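// getsockopt() special-cases SO_BINDTODEVICE, which returns the name of the
// bound network interface as a string; all other supported options are
// integers returned through sockopt_return_uintmax(). Unknown options fail
// with ENOPROTOOPT and unknown levels with EINVAL.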
int TCPSocket::getsockopt(ioctx_t* ctx, int level, int option_name,
                          void* option_value, size_t* option_size_ptr)
{
	ScopedLock lock(&tcp_lock);

	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
	{
		ScopedLock lock(&netifs_lock);
		const char* ifname = "";
		if ( ifindex < netifs_count && netifs[ifindex] )
			ifname = netifs[ifindex]->ifinfo.name;
		size_t option_size;
		if ( !CopyFromUser(&option_size, option_size_ptr, sizeof(option_size)) )
			return -1;
		size_t len = strlen(ifname);
		size_t size = len + 1;
		if ( option_size < size )
			return errno = ERANGE, -1;
		if ( !CopyToUser(option_value, ifname, size) ||
		     !CopyToUser(option_size_ptr, &size, sizeof(size)) )
			return -1;
		return 0;
	}

	uintmax_t result = 0;

	if ( level == IPPROTO_TCP )
	{
		switch ( option_name )
		{
		// TODO: TCP_NODELAY
		// TODO: TCP_MAXSEG
		// TODO: TCP_NOPUSH
		// TODO: TCP_CORK
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else if ( level == SOL_SOCKET )
	{
		switch ( option_name )
		{
		case SO_BINDTOINDEX: result = ifindex; break;
		case SO_DEBUG: result = 0; break;
		case SO_DOMAIN: result = af; break;
		case SO_ERROR: result = sockerr; break;
		case SO_PROTOCOL: result = IPPROTO_TCP; break;
		case SO_RCVBUF: result = sizeof(incoming); break;
		case SO_REUSEADDR: result = reuseaddr; break;
		case SO_SNDBUF: result = sizeof(outgoing); break;
		case SO_TYPE: result = SOCK_STREAM; break;
		// TODO: SO_ACCEPTCONN
		// TODO: SO_LINGER
		// TODO: SO_OOBINLINE
		// TODO: SO_RCVLOWAT
		// TODO: SO_RCVTIMEO
		// TODO: SO_SNDLOWAT
		// TODO: SO_SNDTIMEO
		// TODO: SO_DONTROUTE
		// TODO: SO_BROADCAST
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else
		return errno = EINVAL, -1;

	if ( !sockopt_return_uintmax(result, ctx, option_value, option_size_ptr) )
		return -1;

	return 0;
}

// TODO: os-test socket options on shut down sockets. POSIX says EINVAL.
// TODO: os-test the errno for an invalid protocol.
// TODO: os-test the errno for an invalid option at a protocol level.

int TCPSocket::setsockopt(ioctx_t* ctx, int level, int option_name,
                          const void* option_value, size_t option_size)
{
	ScopedLock lock(&tcp_lock);

	if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE )
	{
		char ifname[IF_NAMESIZE];
		if ( sizeof(ifname) < option_size )
			option_size = sizeof(ifname);
		if ( !CopyFromUser(ifname, option_value, option_size) )
			return -1;
		if ( strnlen(ifname, option_size) == sizeof(ifname) )
			return errno = ENODEV, -1;
		// Only terminate when the name did not fill the whole buffer; if it
		// did, the check above already guarantees a NUL is present, and
		// writing ifname[sizeof(ifname)] would be out of bounds.
		if ( option_size < sizeof(ifname) )
			ifname[option_size] = '\0';
		ScopedLock lock(&netifs_lock);
		for ( size_t i = 1; i < netifs_count; i++ )
		{
			if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) )
			{
				ifindex = i;
				return 0;
			}
		}
		return errno = ENODEV, -1;
	}

	uintmax_t value;
	if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) )
		return -1;

	if ( level == IPPROTO_TCP )
	{
		switch ( option_name )
		{
		case TCP_NODELAY: break; // TODO: Transmit if turned on?
		case TCP_MAXSEG: break; // TODO: Implement this.
		case TCP_NOPUSH: break; // TODO: Implement this.
		// TODO: TCP_CORK
		default:
			return errno = ENOPROTOOPT, -1;
		}
	}
	else if ( level == SOL_SOCKET )
	{
		switch ( option_name )
		{
		case SO_BINDTOINDEX:
			if ( UINT_MAX < value )
				return errno = EINVAL, -1;
			ifindex = value;
			break;
		case SO_DEBUG:
			if ( value != 0 )
				return errno = EPERM, -1;
			break;
		case SO_KEEPALIVE: break; // TODO: Implement this.
		case SO_REUSEADDR: reuseaddr = value; break;
		case SO_LINGER: break; // TODO: Implement this.
		case SO_RCVBUF: break; // TODO: Implement this.
		case SO_SNDBUF: break; // TODO: Implement this.
		// TODO: SO_BROADCAST
		// TODO: SO_DONTROUTE
		// TODO: SO_LINGER
		// TODO: SO_RCVLOWAT
		// TODO: SO_RCVTIMEO
		// TODO: SO_SNDLOWAT
		// TODO: SO_SNDTIMEO
		default: return errno = ENOPROTOOPT, -1;
		}
	}
	else
		return errno = EINVAL, -1;

	return 0;
}

int TCPSocket::shutdown(ioctx_t* /*ctx*/, int how)
{
	ScopedLock lock(&tcp_lock);
	return shutdown_unlocked(how);
}

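// shutdown_unlocked() follows STD 7 (RFC 793) and RFC 1122: SHUT_WR either
// aborts a SYN_SENT connection outright or queues a FIN and moves to LAST_ACK
// (from CLOSE_WAIT) or FIN_WAIT_1 (otherwise), while SHUT_RD stops further
// reads by setting shutdown_receive and waking any blocked readers.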
int TCPSocket::shutdown_unlocked(int how) // tcp_lock taken
{
	// STD 7, RFC 793, page 60.
	if ( state != TCP_STATE_SYN_SENT &&
	     state != TCP_STATE_SYN_RECV &&
	     state != TCP_STATE_ESTAB &&
	     state != TCP_STATE_CLOSE_WAIT )
		return errno = ENOTCONN, -1;
	if ( how & SHUT_WR )
	{
		// STD 7, RFC 793, page 60.
		if ( state == TCP_STATE_SYN_SENT )
			Close();
		else // TCP_STATE_SYN_RECV || TCP_STATE_ESTAB || TCP_STATE_CLOSE_WAIT
		{
			outgoing_fin = TCP_SPECIAL_PENDING;
			// TODO: Should this state transition be delayed until the FIN
			//       enters the window or is sent?
			if ( state == TCP_STATE_CLOSE_WAIT )
				state = TCP_STATE_LAST_ACK /* RFC 1122, 4.2.2.20 (a), page 93 */;
			else
				state = TCP_STATE_FIN_WAIT_1;
			kthread_cond_broadcast(&transmit_cond);
			TransmitLoop();
		}
	}
	if ( how & SHUT_RD )
	{
		shutdown_receive = true;
		kthread_cond_broadcast(&receive_cond);
	}
	return 0;
}

int TCPSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
{
	ScopedLock lock(&tcp_lock);
	if ( !remoted || state == TCP_STATE_LISTEN )
		return errno = ENOTCONN, -1;
	size_t addrsize;
	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
		return -1;
	if ( af == AF_INET )
	{
		if ( sizeof(remote.in) < addrsize )
			addrsize = sizeof(remote.in);
	}
	else if ( af == AF_INET6 )
	{
		if ( sizeof(remote.in6) < addrsize )
			addrsize = sizeof(remote.in6);
	}
	else
		return errno = EAFNOSUPPORT, -1;
	if ( !ctx->copy_to_dest(addr, &remote, addrsize) )
		return -1;
	if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
		return -1;
	return 0;
}

int TCPSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr)
{
	ScopedLock lock(&tcp_lock);
	size_t addrsize;
	if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) )
		return -1;
	if ( af == AF_INET )
	{
		if ( sizeof(local.in) < addrsize )
			addrsize = sizeof(local.in);
	}
	else if ( af == AF_INET6 )
	{
		if ( sizeof(local.in6) < addrsize )
			addrsize = sizeof(local.in6);
	}
	else
		return errno = EAFNOSUPPORT, -1;
	if ( !ctx->copy_to_dest(addr, &local, addrsize) )
		return -1;
	if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) )
		return -1;
	return 0;
}

// TODO: os-test fstat on a socket.
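// TCPSocketNode is the inode wrapper that exposes a TCPSocket through the
// filesystem layer: it marks the socket as referenced, fills in the stat
// metadata (owned by the creating process, mode 0600, S_IFSOCK), forwards
// every socket operation to the underlying TCPSocket, and drops the reference
// again when destroyed.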
TCPSocketNode::TCPSocketNode(TCPSocket* socket)
{
	this->socket = socket;
	socket->is_referenced = true;
	Process* process = CurrentProcess();
	inode_type = INODE_TYPE_STREAM;
	dev = (dev_t) this;
	ino = (ino_t) this;
	type = S_IFSOCK;
	kthread_mutex_lock(&process->idlock);
	stat_uid = process->uid;
	stat_gid = process->gid;
	kthread_mutex_unlock(&process->idlock);
	stat_mode = 0600 | this->type;
}

TCPSocketNode::~TCPSocketNode()
{
	socket->Unreference();
}

Ref<Inode> TCPSocketNode::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize,
                                  int flags)
{
	return socket->accept4(ctx, addr, addrsize, flags);
}

int TCPSocketNode::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	return socket->bind(ctx, addr, addrsize);
}

int TCPSocketNode::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize)
{
	return socket->connect(ctx, addr, addrsize);
}

int TCPSocketNode::listen(ioctx_t* ctx, int backlog)
{
	return socket->listen(ctx, backlog);
}

ssize_t TCPSocketNode::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags)
{
	return socket->recv(ctx, buf, count, flags);
}

ssize_t TCPSocketNode::recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags)
{
	return socket->recvmsg(ctx, msg, flags);
}

ssize_t TCPSocketNode::send(ioctx_t* ctx, const uint8_t* buf, size_t count,
                            int flags)
{
	return socket->send(ctx, buf, count, flags);
}

ssize_t TCPSocketNode::sendmsg(ioctx_t* ctx, const struct msghdr* msg,
                               int flags)
{
	return socket->sendmsg(ctx, msg, flags);
}

ssize_t TCPSocketNode::read(ioctx_t* ctx, uint8_t* buf, size_t count)
{
	return socket->read(ctx, buf, count);
}

ssize_t TCPSocketNode::write(ioctx_t* ctx, const uint8_t* buf, size_t count)
{
	return socket->write(ctx, buf, count);
}

int TCPSocketNode::poll(ioctx_t* ctx, PollNode* node)
{
	return socket->poll(ctx, node);
}

int TCPSocketNode::getsockopt(ioctx_t* ctx, int level, int option_name,
                              void* option_value, size_t* option_size_ptr)
{
	return socket->getsockopt(ctx, level, option_name, option_value,
	                          option_size_ptr);
}

int TCPSocketNode::setsockopt(ioctx_t* ctx, int level, int option_name,
                              const void* option_value, size_t option_size)
{
	return socket->setsockopt(ctx, level, option_name, option_value,
	                          option_size);
}

int TCPSocketNode::shutdown(ioctx_t* ctx, int how)
{
	return socket->shutdown(ctx, how);
}

int TCPSocketNode::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize)
{
	return socket->getpeername(ctx, addr, addrsize);
}

int TCPSocketNode::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize)
{
	return socket->getsockname(ctx, addr, addrsize);
}

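// HandleIP() is the IPv4 receive path for TCP. It drops packets from the
// unspecified source address or sent to a broadcast address, validates the
// header length and the pseudo-header checksum, and then demultiplexes on the
// destination port: an exact local/remote match is preferred, then a listener
// bound to the destination address, then a listener bound to the any address.
// The chosen socket receives the packet and is deleted afterwards if nothing
// references it anymore.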
void HandleIP(Ref<Packet> pkt,
              const struct in_addr* src,
              const struct in_addr* dst,
              bool dst_broadcast)
{
	if ( src->s_addr == htobe32(INADDR_ANY) )
		return;
	if ( dst_broadcast )
		return;
	const unsigned char* in = pkt->from + pkt->offset;
	size_t inlen = pkt->length - pkt->offset;
	struct tcphdr hdr;
	if ( inlen < sizeof(hdr) )
		return;
	if ( UINT16_MAX < inlen )
		return;
	memcpy(&hdr, in, sizeof(hdr));
	hdr.th_sport = be16toh(hdr.th_sport);
	hdr.th_dport = be16toh(hdr.th_dport);
	hdr.th_sum = be16toh(hdr.th_sum);
	uint16_t sum = 0;
	sum = IP::ipsum_buf(sum, src, sizeof(struct in_addr));
	sum = IP::ipsum_buf(sum, dst, sizeof(struct in_addr));
	sum = IP::ipsum_word(sum, IPPROTO_TCP);
	sum = IP::ipsum_word(sum, inlen);
	sum = IP::ipsum_buf(sum, in, inlen);
	if ( sum != 0 && sum != 0xFFFF )
		return;
	if ( TCP_OFFSET_DECODE(hdr.th_offset) < sizeof(hdr) / 4 ||
	     inlen < (size_t) TCP_OFFSET_DECODE(hdr.th_offset) * 4 )
		return;
	// Port 0 is not valid.
	if ( hdr.th_sport == 0 || hdr.th_dport == 0 )
		return;
	// TODO: TCP options. Respect TCPOPT_MAXSEG.
	TCPSocket* socket = NULL;
	TCPSocket* socket_listener = NULL;
	TCPSocket* any_socket_listener = NULL;
	ScopedLock lock(&tcp_lock);
	for ( TCPSocket* iter = bindings_v4[hdr.th_dport];
	      !socket && iter;
	      iter = iter->next_socket )
	{
		// TODO: If a TCP socket is bound, and then connected to, what happens?
		//       What if the TCP socket then connects to the other side?
		if ( !iter->remoted )
			continue;
		// The datagram was sent to the socket's local address.
		if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) )
		{
			// The first priority is to receive on a socket with the correct
			// local address and the correct remote address.
			if ( !memcmp(&iter->remote.in.sin_addr, src, sizeof(*src)) &&
			     be16toh(iter->remote.in.sin_port) == hdr.th_sport )
				socket = iter;
			// The second priority is to receive on a socket with the correct
			// local address and listening for connections from any address.
			else if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
				socket_listener = iter;
		}
		// The socket is bound to the any address.
		if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
		{
			// The third priority is to receive on a socket bound to the any
			// address and listening for connections from any address.
			if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) )
				any_socket_listener = iter;
		}
	}
	if ( !socket )
		socket = socket_listener;
	if ( !socket )
		socket = any_socket_listener;
	// No socket wanted to receive the packet.
	if ( !socket )
	{
		// TODO: Send RST.
		return;
	}
	// If the socket is bound to a network interface, require the packet to
	// have been received on that network interface.
	if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid )
	{
		// TODO: Send RST.
		return;
	}
	union tcp_sockaddr pkt_src;
	pkt_src.in.sin_family = AF_INET;
	pkt_src.in.sin_addr = *src;
	pkt_src.in.sin_port = htobe16(hdr.th_sport);
	union tcp_sockaddr pkt_dst;
	pkt_dst.in.sin_family = AF_INET;
	pkt_dst.in.sin_addr = *dst;
	pkt_dst.in.sin_port = htobe16(hdr.th_dport);
	// Receive the packet on the socket.
	socket->ReceivePacket(pkt, &pkt_src, &pkt_dst);
	// Delete the socket if needed or schedule a transmit if needed.
	if ( socket->can_destroy() )
		delete socket;
}

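// Socket() creates a new unbound TCP socket inode for the given address
// family, cleaning up if either allocation fails. A caller would use it along
// the lines of: Ref<Inode> node = TCP::Socket(AF_INET);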
Ref<Inode> Socket(int af)
{
	if ( !IsSupportedAddressFamily(af) )
		return errno = EAFNOSUPPORT, Ref<Inode>(NULL);
	TCPSocket* socket = new TCPSocket(af);
	if ( !socket )
		return Ref<Inode>();
	Ref<TCPSocketNode> result(new TCPSocketNode(socket));
	if ( !result )
		return delete socket, Ref<Inode>();
	return result;
}

} // namespace TCP
} // namespace Sortix