diff --git a/Cargo.toml b/Cargo.toml index e66f44a67..4b78cf4e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,6 +96,9 @@ defmt = ["dep:defmt", "heapless/defmt"] "packetmeta-id" = [] +# Enables segmentation offload support. +"segmentation-offload" = [] + "async" = [] # Automatically reply on an ICMP echo request diff --git a/src/iface/interface/mod.rs b/src/iface/interface/mod.rs index 5b0bbcda5..b70d978b6 100644 --- a/src/iface/interface/mod.rs +++ b/src/iface/interface/mod.rs @@ -770,13 +770,11 @@ impl Interface { }) } #[cfg(feature = "socket-tcp")] - Socket::Tcp(socket) => socket.dispatch(&mut self.inner, |inner, (ip, tcp)| { - respond( - inner, - PacketMeta::default(), - Packet::new(ip, IpPayload::Tcp(tcp)), - ) - }), + Socket::Tcp(socket) => { + socket.dispatch(&mut self.inner, |inner, meta, (ip, tcp)| { + respond(inner, meta, Packet::new(ip, IpPayload::Tcp(tcp))) + }) + } #[cfg(feature = "socket-dhcpv4")] Socket::Dhcpv4(socket) => { socket.dispatch(&mut self.inner, |inner, (ip, udp, dhcp)| { @@ -833,6 +831,12 @@ impl InterfaceInner { self.caps.checksum.clone() } + #[cfg(feature = "segmentation-offload")] + #[allow(unused)] // unused depending on which sockets are enabled + pub(crate) fn segmentation_caps(&self) -> crate::phy::SegmentationCapabilities { + self.caps.segmentation.clone() + } + #[allow(unused)] // unused depending on which sockets are enabled pub(crate) fn ip_mtu(&self) -> usize { self.caps.ip_mtu() @@ -1273,7 +1277,15 @@ impl InterfaceInner { #[cfg(feature = "proto-ipv4")] IpRepr::Ipv4(repr) => { // If we have an IPv4 packet, then we need to check if we need to fragment it. - if total_ip_len > self.caps.ip_mtu() { + let should_fragment = total_ip_len > self.caps.ip_mtu(); + + // If the second condition is false (i.e. the metadata includes a target segment + // size), the packet will be segmented by the device and fragmentation on our side + // is not necessary. 
+ #[cfg(feature = "segmentation-offload")] + let should_fragment = should_fragment && meta.segmentation_offload_size.is_none(); + + if should_fragment { #[cfg(feature = "proto-ipv4-fragmentation")] { net_debug!("start fragmentation"); diff --git a/src/phy/mod.rs b/src/phy/mod.rs index 1d3de66e5..b86d5d546 100644 --- a/src/phy/mod.rs +++ b/src/phy/mod.rs @@ -89,6 +89,8 @@ impl<'a> phy::TxToken for StmPhyTxToken<'a> { )] use crate::time::Instant; +#[cfg(feature = "segmentation-offload")] +use core::num::{NonZeroU16, NonZeroUsize}; #[cfg(all( any(feature = "phy-raw_socket", feature = "phy-tuntap_interface"), @@ -147,7 +149,7 @@ pub const IPV4_FRAGMENT_PAYLOAD_ALIGNMENT: usize = 8; /// struct becomes zero-sized, which allows the compiler to optimize it out as if /// the packet metadata mechanism didn't exist at all. /// -/// Currently only UDP sockets allow setting/retrieving packet metadata. The metadata +/// Currently only TCP and UDP sockets allow setting/retrieving packet metadata. The metadata /// for packets emitted with other sockets will be all default values. /// /// This struct is marked as `#[non_exhaustive]`. This means it is not possible to @@ -168,6 +170,8 @@ pub const IPV4_FRAGMENT_PAYLOAD_ALIGNMENT: usize = 8; pub struct PacketMeta { #[cfg(feature = "packetmeta-id")] pub id: u32, + #[cfg(feature = "segmentation-offload")] + pub segmentation_offload_size: Option<NonZeroU16>, } /// A description of checksum behavior for a particular protocol. @@ -233,6 +237,28 @@ impl ChecksumCapabilities { } } +/// The maximum buffer size for a particular protocol or protocol pair that +/// can be offloaded to the device for segmentation, or [None] if segmentation +/// offload is not supported. +/// +/// For Ethernet devices, this includes the Ethernet header (14 octets), but +/// *not* the Ethernet FCS (4 octets). 
+/// +/// If the device supports unsegmented IP packets with (depending on the IP +/// version, total or payload) lengths greater than [u16::MAX], it should not +/// rely on the length field in the IP header, as the actual length cannot be +/// represented there. The value will be 0 instead. +#[derive(Debug, Clone, Default)] +#[cfg_attr(feature = "defmt", derive(defmt::Format))] +#[non_exhaustive] +#[cfg(feature = "segmentation-offload")] +pub struct SegmentationCapabilities { + #[cfg(all(feature = "socket-tcp", feature = "proto-ipv4"))] + pub tcpv4: Option<NonZeroUsize>, + #[cfg(all(feature = "socket-tcp", feature = "proto-ipv6"))] + pub tcpv6: Option<NonZeroUsize>, +} + /// A description of device capabilities. /// /// Higher-level protocols may achieve higher throughput or lower latency if they consider @@ -276,6 +302,13 @@ pub struct DeviceCapabilities { /// If the network device is capable of verifying or computing checksums for some protocols, /// it can request that the stack not do so in software to improve performance. pub checksum: ChecksumCapabilities, + + #[cfg(feature = "segmentation-offload")] + /// Segmentation offload capabilities. + /// + /// If the network device is capable of segmenting packets for some protocols, + /// it can request that the stack not do so in software to improve performance. + pub segmentation: SegmentationCapabilities, } impl DeviceCapabilities { diff --git a/src/phy/pcap_writer.rs b/src/phy/pcap_writer.rs index a1014fc36..a7c5be282 100644 --- a/src/phy/pcap_writer.rs +++ b/src/phy/pcap_writer.rs @@ -62,7 +62,7 @@ pub trait PcapSink { self.write_u16(4); // minor version self.write_u32(0); // timezone (= UTC) self.write_u32(0); // accuracy (not used) - self.write_u32(65535); // maximum packet length + self.write_u32(self.max_packet_size()); // maximum packet length self.write_u32(link_type.into()); // link-layer header type } @@ -71,24 +71,41 @@ pub trait PcapSink { /// See also the note for [global_header](#method.global_header). 
/// /// # Panics - /// This function panics if `length` is greater than 65535. + /// This function panics if `length` is greater than [u32::MAX]. fn packet_header(&mut self, timestamp: Instant, length: usize) { - assert!(length <= 65535); + let original_length = length.try_into().unwrap(); self.write_u32(timestamp.secs() as u32); // timestamp seconds self.write_u32(timestamp.micros() as u32); // timestamp microseconds - self.write_u32(length as u32); // captured length - self.write_u32(length as u32); // original length + self.write_u32(self.max_packet_size().min(original_length)); // captured length + self.write_u32(original_length); } /// Write the libpcap packet header followed by packet data into the sink. /// + /// The default implementation truncates packets that are larger than [Self::max_packet_size]. + /// /// See also the note for [global_header](#method.global_header). fn packet(&mut self, timestamp: Instant, packet: &[u8]) { - self.packet_header(timestamp, packet.len()); - self.write(packet); + let packet_len = packet.len(); + let max_packet_size = usize::try_from(self.max_packet_size()).unwrap(); + + self.packet_header(timestamp, packet_len); + self.write(&packet[..max_packet_size.min(packet_len)]); self.flush(); } + + /// Return the maximum size for captured packets. + /// + /// The captures of packets larger than this size will be truncated by default. Excessively + /// large values may cause the software reading the captures to allocate unnecessarily large + /// buffers. + fn max_packet_size(&self) -> u32 { + // Use the default value used by [libpcap] and [Wireshark]. 
+ // [Wireshark]: https://gitlab.com/wireshark/wireshark/-/blob/v3.5.0/wiretap/wtap.h#L334 + // [libpcap]: https://github.com/the-tcpdump-group/libpcap/blob/libpcap-1.6.0-bp/pcap-int.h#L106 + 262144 + } } #[cfg(feature = "std")] diff --git a/src/socket/tcp.rs b/src/socket/tcp.rs index 3cc436da6..ba569b67e 100644 --- a/src/socket/tcp.rs +++ b/src/socket/tcp.rs @@ -7,6 +7,7 @@ use core::fmt::Display; use core::task::Waker; use core::{fmt, mem}; +use crate::phy::PacketMeta; #[cfg(feature = "async")] use crate::socket::WakerRegistration; use crate::socket::{Context, PollAt}; @@ -2351,7 +2352,7 @@ impl<'a> Socket<'a> { pub(crate) fn dispatch<F, E>(&mut self, cx: &mut Context, emit: F) -> Result<(), E> where - F: FnOnce(&mut Context, (IpRepr, TcpRepr)) -> Result<(), E>, + F: FnOnce(&mut Context, PacketMeta, (IpRepr, TcpRepr)) -> Result<(), E>, { if self.tuple.is_none() { return Ok(()); @@ -2478,6 +2479,15 @@ impl<'a> Socket<'a> { let mut is_zero_window_probe = false; + #[cfg_attr( + not(feature = "segmentation-offload"), + expect( + unused_mut, + reason = "The default is not mutated if the segmentation offload feature is not enabled." + ) + )] + let mut packet_meta = PacketMeta::default(); + match self.state { // We transmit an RST in the CLOSED state. If we ended up in the CLOSED state // with a specified endpoint, it means that the socket was aborted. @@ -2536,17 +2546,54 @@ impl<'a> Socket<'a> { is_zero_window_probe = true; } - // Maximum size we're allowed to send. This can be limited by 3 factors: + // Maximum size we're allowed to send can be limited by 3 factors: // 1. remote window // 2. MSS the remote is willing to accept, probably determined by their MTU // 3. MSS we can send, determined by our MTU. - let size = win_limit - .min(self.remote_mss) + // + // If the device supports its offload, segmentation that is needed + // to comply with the latter two will be handled by the device based on the + // metadata we provide. 
+ + let segment_size = self + .remote_mss .min(cx.ip_mtu() - ip_repr.header_len() - TCP_HEADER_LEN); + #[cfg(not(feature = "segmentation-offload"))] + let device_limit = segment_size; + + #[cfg(feature = "segmentation-offload")] + let device_limit = { + let segmentation_caps = cx.segmentation_caps(); + match ip_repr.version() { + #[cfg(feature = "proto-ipv4")] + crate::wire::IpVersion::Ipv4 => segmentation_caps.tcpv4, + #[cfg(feature = "proto-ipv6")] + crate::wire::IpVersion::Ipv6 => segmentation_caps.tcpv6, + } + .map(|buf_size| { + #[cfg(feature = "medium-ethernet")] + let ip_mtu = buf_size.get() - crate::wire::ETHERNET_HEADER_LEN; + #[cfg(not(feature = "medium-ethernet"))] + let ip_mtu = buf_size.get(); + ip_mtu - ip_repr.header_len() - TCP_HEADER_LEN + }) + .unwrap_or(segment_size) + }; + + let size = win_limit.min(device_limit); + let offset = self.remote_last_seq - self.local_seq_no; repr.payload = self.tx_buffer.get_allocated(offset, size); + #[cfg(feature = "segmentation-offload")] + if repr.payload.len() > segment_size { + packet_meta.segmentation_offload_size = + core::num::NonZeroU16::try_from(u16::try_from(segment_size).unwrap()) + .unwrap() + .into(); + } + // If we've sent everything we had in the buffer, follow it with the PSH or FIN // flags, depending on whether the transmit half of the connection is open. if offset + repr.payload.len() == self.tx_buffer.len() { @@ -2616,7 +2663,7 @@ impl<'a> Socket<'a> { // to not waste time waiting for the retransmit timer on packets that we know // for sure will not be successfully transmitted. ip_repr.set_payload_len(repr.buffer_len()); - emit(cx, (ip_repr, repr))?; + emit(cx, packet_meta, (ip_repr, repr))?; // We've sent something, whether useful data or a keep-alive packet, so rewind // the keep-alive timer. 
@@ -2909,7 +2956,7 @@ mod test { let mut sent = 0; let result = socket .socket - .dispatch(&mut socket.cx, |_, (ip_repr, tcp_repr)| { + .dispatch(&mut socket.cx, |_, _, (ip_repr, tcp_repr)| { assert_eq!(ip_repr.next_header(), IpProtocol::Tcp); assert_eq!(ip_repr.src_addr(), LOCAL_ADDR.into()); assert_eq!(ip_repr.dst_addr(), REMOTE_ADDR.into()); @@ -2930,7 +2977,7 @@ mod test { socket.cx.set_now(timestamp); let mut fail = false; - let result: Result<(), ()> = socket.socket.dispatch(&mut socket.cx, |_, _| { + let result: Result<(), ()> = socket.socket.dispatch(&mut socket.cx, |_, _, _| { fail = true; Ok(()) }); @@ -7994,7 +8041,7 @@ mod test { s.set_hop_limit(Some(0x2a)); assert_eq!( - s.socket.dispatch(&mut s.cx, |_, (ip_repr, _)| { + s.socket.dispatch(&mut s.cx, |_, _, (ip_repr, _)| { assert_eq!(ip_repr.hop_limit(), 0x2a); Ok::<_, ()>(()) }), diff --git a/src/wire/ipv4.rs b/src/wire/ipv4.rs index 9342692ef..8643d12c3 100644 --- a/src/wire/ipv4.rs +++ b/src/wire/ipv4.rs @@ -590,7 +590,16 @@ impl Repr { packet.set_header_len(field::DST_ADDR.end as u8); packet.set_dscp(0); packet.set_ecn(0); + #[cfg(not(feature = "segmentation-offload"))] let total_len = packet.header_len() as u16 + self.payload_len as u16; + #[cfg(feature = "segmentation-offload")] + // If because of segmentation offload the length of the buffer exceeds what can be + // represented in the length field of the IP header, we fall back to 0. It will be + // filled by the device during segmentation anyways. 
+ let total_len = u16::try_from(self.payload_len) + .ok() + .and_then(|payload_len: u16| payload_len.checked_add(packet.header_len() as u16)) + .unwrap_or(0); packet.set_total_len(total_len); packet.set_ident(0); packet.clear_flags(); diff --git a/src/wire/ipv6.rs b/src/wire/ipv6.rs index 868743f7c..941790151 100644 --- a/src/wire/ipv6.rs +++ b/src/wire/ipv6.rs @@ -631,7 +631,14 @@ impl Repr { packet.set_version(6); packet.set_traffic_class(0); packet.set_flow_label(0); - packet.set_payload_len(self.payload_len as u16); + #[cfg(not(feature = "segmentation-offload"))] + let payload_len = self.payload_len as u16; + #[cfg(feature = "segmentation-offload")] + // If because of segmentation offload the length of the buffer exceeds what can be + // represented in the length field of the IP header, we fall back to 0. It will be + // filled by the device during segmentation anyways. + let payload_len = u16::try_from(self.payload_len).unwrap_or(0); + packet.set_payload_len(payload_len); packet.set_hop_limit(self.hop_limit); packet.set_next_header(self.next_header); packet.set_src_addr(self.src_addr);