List: tcpdump-workers
Subject: Re: [tcpdump-workers] Libpcap reentrancy and PF_RING patch
From: Guy Harris <guy () alum ! mit ! edu>
Date: 2008-01-24 21:08:44
Message-ID: 4798FE5C.7090308 () alum ! mit ! edu
Luca Deri wrote:
> I have considered your suggestion to move pfring into a pcap-pfring.*
> file.
I didn't make such a suggestion.
I did say
>> Would it work better if, for PF_RING sockets, there were a separate
>> pcap_read_pf_ring, and handle->read_op were set to pcap_read_pf_ring if
>> a PF_RING socket were being used? That'd avoid some per-packet checks in
>> the read_op routine, and might involve fewer #ifdefs as well.
>
> For that matter, should there be a separate live_open_pf_ring()
> routine, with that routine called first and, if it fails (e.g., because
> the system doesn't have PF_RING support), live_open_new() called?
but "make separate routines" doesn't require those routines to be in a
separate file - see, for example, the current top-of-tree pcap-linux.c,
where there are already some separate routines for accessing the
memory-mapped ring buffer.
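To make "separate routines" concrete, here's a rough, self-contained
sketch of the function-pointer dispatch (the types and names are
simplified stand-ins, not the real pcap-int.h ones): the open routine
decides once which capture mechanism is in use and stores the matching
read routine in read_op, so the per-packet path has no "is this a
PF_RING socket?" checks and needs fewer #ifdefs.

#include <stdio.h>

typedef void (*example_handler)(unsigned char *user, const unsigned char *pkt);

struct example_pcap {
	int use_pf_ring;	/* decided once, at open time */
	int (*read_op)(struct example_pcap *, int, example_handler, unsigned char *);
};

static int
read_pf_ring(struct example_pcap *p, int cnt, example_handler cb, unsigned char *user)
{
	(void)p; (void)cnt; (void)cb; (void)user;
	printf("reading via PF_RING\n");
	return 0;
}

static int
read_packet_socket(struct example_pcap *p, int cnt, example_handler cb, unsigned char *user)
{
	(void)p; (void)cnt; (void)cb; (void)user;
	printf("reading via PF_PACKET\n");
	return 0;
}

static void
open_live(struct example_pcap *p, int want_pf_ring)
{
	/* Pick the read routine once; all callers just go through read_op. */
	p->use_pf_ring = want_pf_ring;
	p->read_op = want_pf_ring ? read_pf_ring : read_packet_socket;
}

int main(void)
{
	struct example_pcap p;

	open_live(&p, 1);
	p.read_op(&p, 1, NULL, NULL);	/* dispatches to read_pf_ring */
	return 0;
}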
Here's a patch (unified diff) to the current top-of-tree pcap-linux.c
that implements that scheme. It also cleans up some things that looked
as if they were problems:
- PF_RING sockets don't support a mechanism similar to PF_PACKET
sockets' PACKET_ADD_MEMBERSHIP mechanism for enabling promiscuous mode
(as well as "all-multicast" mode and adding particular multicast
addresses), which automatically cancels the membership (thus turning
promiscuous mode off) when the socket is closed.  So, for PF_RING
sockets, as with SOCK_PACKET sockets, we have to add the pcap_t to the
list of pcap_ts to be closed on exit, so that if an app exits without
explicitly closing a pcap_t, the pcap_t is still closed and promiscuous
mode is turned off (that doesn't handle exiting due to a signal, of
course; a minimal sketch of that close-on-exit pattern follows this
list);

- pfring_open() doesn't appear to handle NULL or "any" as the device
argument, so we don't use PF_RING in that case;

- it also doesn't use SOCK_DGRAM sockets, so, in cases where we fall
back on cooked mode, we don't use PF_RING.
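Here's a rough, self-contained sketch of that close-on-exit pattern
(simplified stand-in types and names, not the real pcap-int.h or
pcap-linux.c ones): handles that had promiscuous mode turned on are
linked onto a list, and an atexit() handler walks the list so
promiscuous mode still gets turned off if the application exits
without calling pcap_close().

#include <stdio.h>
#include <stdlib.h>

struct example_handle {
	const char *device;
	struct example_handle *next;
};

static struct example_handle *handles_to_close;

static void
close_one(struct example_handle *h)
{
	/* In the real code this would clear IFF_PROMISC via SIOCSIFFLAGS. */
	printf("turning promiscuous mode off on %s\n", h->device);
}

static void
close_all(void)
{
	struct example_handle *h;

	while ((h = handles_to_close) != NULL) {
		handles_to_close = h->next;
		close_one(h);
		free(h);
	}
}

static struct example_handle *
open_promisc(const char *device)
{
	static int atexit_registered;
	struct example_handle *h = malloc(sizeof(*h));

	if (h == NULL)
		return NULL;
	h->device = device;

	/* Register the cleanup handler once, then link the handle in. */
	if (!atexit_registered) {
		atexit(close_all);
		atexit_registered = 1;
	}
	h->next = handles_to_close;
	handles_to_close = h;
	return h;
}

int main(void)
{
	open_promisc("eth0");
	/* No explicit close: close_all() still runs at normal exit. */
	return 0;
}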
["patch" (text/plain)]
Index: pcap-linux.c
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-linux.c,v
retrieving revision 1.134
diff -u -r1.134 pcap-linux.c
--- pcap-linux.c 24 Jan 2008 20:20:08 -0000 1.134
+++ pcap-linux.c 24 Jan 2008 20:51:58 -0000
@@ -239,6 +239,13 @@
static int pcap_getnonblock_mmap(pcap_t *p, char *errbuf);
#endif
+#ifdef HAVE_PF_RING
+static int live_open_pf_ring(pcap_t *, const char *, int, int, char *);
+static void pcap_close_linux_pf_ring(pcap_t *);
+static int pcap_read_linux_pf_ring(pcap_t *, int, pcap_handler, u_char *);
+static int pcap_stats_linux_pf_ring(pcap_t *, struct pcap_stat *);
+#endif
+
/*
* Wrap some ioctl calls
*/
@@ -355,6 +362,30 @@
return NULL;
}
+#ifdef HAVE_PF_RING
+ /*
+ * Try to use the PF_RING socket mechanism first.
+ */
+ err = live_open_pf_ring(handle, device, promisc, to_ms, ebuf);
+ if (err == 1) {
+ /*
+ * That succeeded.
+ */
+ live_open_ok = 1; /* succeeded */
+
+ /*
+ * Override certain operations.
+ */
+ handle->close_op = pcap_close_linux_pf_ring;
+ handle->read_op = pcap_read_linux_pf_ring;
+ handle->stats_op = pcap_stats_linux_pf_ring;
+ } else if (err == 0) {
+ /*
+ * That failed, but not fatally - try using the other
+ * mechanisms.
+ */
+#endif
+
/*
* Current Linux kernels use the protocol family PF_PACKET to
* allow direct access to all packets on the network while
@@ -375,11 +406,14 @@
if (live_open_old(handle, device, promisc, to_ms, ebuf))
live_open_ok = 1;
}
+#ifdef HAVE_PF_RING
+ }
+#endif
if (!live_open_ok) {
/*
- * Both methods to open the packet socket failed. Tidy
- * up and report our failure (ebuf is expected to be
- * set by the functions above).
+ * All methods to open the device for capturing failed.
+ * Tidy up and report our failure (ebuf is expected to
+ * be set by the functions above).
*/
if (handle->md.device != NULL)
@@ -2502,6 +2536,19 @@
int ret;
int save_errno;
+#ifdef HAVE_PF_RING
+ if (handle->ring) {
+ /*
+ * For PF_RING sockets, we don't do the flushing
+ * stuff.
+ * XXX - do we need to do so?
+ * XXX - should this be SOL_SOCKET or 0?
+ */
+ return setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER,
+ fcode, sizeof(*fcode));
+ }
+#endif
+
/*
* The socket filter code doesn't discard all packets queued
* up on the socket when the filter is changed; this means
@@ -2611,3 +2658,364 @@
&dummy, sizeof(dummy));
}
#endif
+
+#ifdef HAVE_PF_RING
+
+/* ===== Functions to interface to the PF_RING mechanism ================== */
+
+/*
+ * Try to open a packet socket using the PF_RING mechanism.
+ * Returns 1 on success, 0 on a failure that means the regular
+ * PF_PACKET/SOCK_PACKET mechanisms should be tried instead, and
+ * -2 on a hard failure where they shouldn't be tried.
+ */
+static int
+live_open_pf_ring(pcap_t *handle, const char *device, int promisc,
+ int to_ms, char *ebuf)
+{
+ int sock_fd, arptype;
+
+ /*
+ * The PF_RING library doesn't support a PF_RING socket not
+ * bound to a device (it assumes the "device" argument
+ * is non-null).
+ */
+ if (device == NULL)
+ return 0;
+
+ handle->ring = pfring_open((char*)device, promisc, 1);
+ if (handle->ring == NULL)
+ return 0;
+ sock_fd = handle->fd = handle->ring->fd;
+ handle->bufsize = handle->snapshot;
+
+ /* It seems the kernel supports the new interface. */
+ handle->md.sock_packet = 0;
+
+ /*
+ * Get the interface index of the loopback device.
+ * If the attempt fails, don't fail, just set the
+ * "md.lo_ifindex" to -1.
+ *
+ * XXX - can there be more than one device that loops
+ * packets back, i.e. devices other than "lo"? If so,
+ * we'd need to find them all, and have an array of
+ * indices for them, and check all of them in
+ * "pcap_read_packet()".
+ */
+ handle->md.lo_ifindex = iface_get_id(sock_fd, "lo", ebuf);
+
+ /*
+ * Default value for offset to align link-layer payload
+ * on a 4-byte boundary.
+ */
+ handle->offset = 0;
+
+ /*
+ * What kind of frames do we have to deal with? Fail,
+ * but not fatally, if we have an unknown interface type,
+ * so that we'll try a PF_PACKET socket.
+ */
+ /* Assume for now we don't need cooked mode. */
+ handle->md.cooked = 0;
+
+ arptype = iface_get_arptype(sock_fd, device, ebuf);
+ if (arptype == -1) {
+ /*
+ * Shut down the ring.
+ */
+ pfring_close(handle->ring);
+
+ /*
+ * Get rid of any link-layer type list we allocated.
+ */
+ if (handle->dlt_list != NULL)
+ free(handle->dlt_list);
+ /*
+ * This is a fatal error; we won't try using
+ * PF_PACKET sockets, as they'll presumably
+ * get the same error from iface_get_arptype().
+ */
+ return -2;
+ }
+ map_arphrd_to_dlt(handle, arptype, 1);
+ if (handle->linktype == -1 ||
+ handle->linktype == DLT_LINUX_SLL ||
+ handle->linktype == DLT_LINUX_IRDA ||
+ handle->linktype == DLT_LINUX_LAPD ||
+ (handle->linktype == DLT_EN10MB &&
+ (strncmp("isdn", device, 4) == 0 ||
+ strncmp("isdY", device, 4) == 0))) {
+ /*
+ * Unknown interface type (-1), or a
+ * device we explicitly chose to run
+ * in cooked mode (e.g., PPP devices),
+ * or an ISDN device (whose link-layer
+ * type we can only determine by using
+ * APIs that may be different on different
+ * kernels) - fail, as PF_RING sockets only
+ * support SOCK_RAW, not SOCK_DGRAM, so
+ * there's no cooked mode.
+ */
+ /*
+ * Get rid of any link-layer type list
+ * we allocated - this only supports cooked
+ * capture.
+ */
+ if (handle->dlt_list != NULL) {
+ free(handle->dlt_list);
+ handle->dlt_list = NULL;
+ handle->dlt_count = 0;
+ }
+ pfring_close(handle->ring);
+ handle->ring = NULL;
+ return 0;
+ }
+
+ /*
+ * PF_RING uses the old SIOCSIFFLAGS-based mechanism for turning
+ * promiscuous mode on and off, so, just as we have to do when
+ * using SOCK_PACKET sockets, we have to add this to the list
+ * of pcaps to close when we exit, so promiscuous mode gets
+ * turned off even if the application exits without explicitly
+ * closing the pcap_t.
+ */
+ handle->md.next = pcaps_to_close;
+ pcaps_to_close = handle;
+
+ return 1;
+}
+
+static void
+pcap_close_linux_pf_ring(pcap_t *handle)
+{
+ pfring_close(handle->ring);
+
+ /*
+ * pfring_close() already closed the file descriptor, so set
+ * handle->fd to -1 so pcap_close_common() doesn't close it.
+ */
+ handle->fd = -1;
+ pcap_close_linux(handle);
+}
+
+/*
+ * Read a packet from the socket, calling the handler provided by
+ * the user. Returns the number of packets received or -1 if an
+ * error occurred.
+ */
+static int
+pcap_read_linux_pf_ring(pcap_t *handle, int max_packets, pcap_handler callback, u_char *userdata)
+{
+{
+ u_char *bp;
+ int packet_len, caplen;
+ struct pfring_pkthdr pcap_header;
+
+ for (;;) {
+ if (handle->break_loop) {
+ /*
+ * Yes - clear the flag that indicates that it
+ * has, and return -2 as an indication that we
+ * were told to break out of the loop.
+ *
+ * Patch courtesy of Michael Stiller <ms@2scale.net>
+ */
+ handle->break_loop = 0;
+ return -2;
+ }
+
+ packet_len = pfring_recv(handle->ring, (char*)handle->buffer,
+ handle->bufsize,
+ &pcap_header,
+ 1 /* wait_for_incoming_packet */);
+ if (packet_len > 0) {
+ bp = handle->buffer;
+ pcap_header.caplen = min(pcap_header.caplen, handle->bufsize);
+ caplen = pcap_header.caplen, packet_len = pcap_header.len;
+ break;
+ } else if (packet_len == -1 && errno == EINTR)
+ continue;
+ else
+ return -1;
+ }
+
+ /*
+ * XXX: According to the kernel source we should get the real
+ * packet len if calling recvfrom with MSG_TRUNC set. It does
+ * not seem to work here :(, but it is supported by this code
+ * anyway.
+ * To be honest the code RELIES on that feature so this is really
+ * broken with 2.2.x kernels.
+ * I spent a day figuring out what's going on, and found that
+ * the following is happening:
+ *
+ * The packet comes from a random interface and the packet_rcv
+ * hook is called with a clone of the packet. That code inserts
+ * the packet into the receive queue of the packet socket.
+ * If a filter is attached to that socket that filter is run
+ * first - and there lies the problem. The default filter always
+ * cuts the packet at the snaplen:
+ *
+ * # tcpdump -d
+ * (000) ret #68
+ *
+ * So the packet filter cuts down the packet. The recvfrom call
+ * says "hey, it's only 68 bytes, it fits into the buffer" with
+ * the result that we don't get the real packet length. This
+ * is valid at least until kernel 2.2.17pre6.
+ *
+ * We currently handle this by making a copy of the filter
+ * program, fixing all "ret" instructions with non-zero
+ * operands to have an operand of 65535 so that the filter
+ * doesn't truncate the packet, and supplying that modified
+ * filter to the kernel.
+ *
+ * XXX - does any of that apply for PF_RING?
+ */
+ caplen = packet_len;
+ if (caplen > handle->snapshot)
+ caplen = handle->snapshot;
+
+ /* Run the packet filter if not using kernel filter */
+ if (!handle->md.use_bpf && handle->fcode.bf_insns) {
+ if (bpf_filter(handle->fcode.bf_insns, bp,
+ packet_len, caplen) == 0)
+ {
+ /* rejected by filter */
+ return 0;
+ }
+ }
+
+ /*
+ * Count the packet.
+ *
+ * Arguably, we should count them before we check the filter,
+ * as on many other platforms "ps_recv" counts packets
+ * handed to the filter rather than packets that passed
+ * the filter, but if filtering is done in the kernel, we
+ * can't get a count of packets that passed the filter,
+ * and that would mean the meaning of "ps_recv" wouldn't
+ * be the same on all Linux systems.
+ *
+ * XXX - it's not the same on all systems in any case;
+ * ideally, we should have a "get the statistics" call
+ * that supplies more counts and indicates which of them
+ * it supplies, so that we supply a count of packets
+ * handed to the filter only on platforms where that
+ * information is available.
+ *
+ * We count them here even if we can get the packet count
+ * from the kernel, as we can only determine at run time
+ * whether we'll be able to get it from the kernel (if
+ * HAVE_TPACKET_STATS isn't defined, we can't get it from
+ * the kernel, but if it is defined, the library might
+ * have been built with a 2.4 or later kernel, but we
+ * might be running on a 2.2[.x] kernel without Alexey
+ * Kuznetzov's turbopacket patches, and thus the kernel
+ * might not be able to supply those statistics). We
+ * could, I guess, try, when opening the socket, to get
+ * the statistics and, if we can, not increment the count
+ * here; but it's not clear that always incrementing
+ * the count is more expensive than always testing a flag
+ * in memory.
+ *
+ * We keep the count in "md.packets_read", and use that for
+ * "ps_recv" if we can't get the statistics from the kernel.
+ * We do that because, if we *can* get the statistics from
+ * the kernel, we use "md.stat.ps_recv" and "md.stat.ps_drop"
+ * as running counts, as reading the statistics from the
+ * kernel resets the kernel statistics, and if we directly
+ * increment "md.stat.ps_recv" here, that means it will
+ * count packets *twice* on systems where we can get kernel
+ * statistics - once here, and once in pcap_stats_linux().
+ */
+ handle->md.packets_read++;
+
+ /* Call the user supplied callback function */
+ callback(userdata, (struct pcap_pkthdr*)&pcap_header, bp);
+ return 1;
+}
+
+/*
+ * Get the statistics for the given packet capture handle.
+ */
+static int
+pcap_stats_linux_pf_ring(pcap_t *handle, struct pcap_stat *stats)
+{
+ struct tpacket_stats kstats;
+ socklen_t len = sizeof (struct tpacket_stats);
+
+ /*
+ * Try to get the packet counts from the kernel.
+ */
+ if (getsockopt(handle->fd, SOL_PACKET, PACKET_STATISTICS,
+ &kstats, &len) > -1) {
+ /*
+ * On systems where the PACKET_STATISTICS "getsockopt()"
+ * argument is supported on PF_PACKET sockets:
+ *
+ * "ps_recv" counts only packets that *passed* the
+ * filter, not packets that didn't pass the filter.
+ * This includes packets later dropped because we
+ * ran out of buffer space.
+ *
+ * "ps_drop" counts packets dropped because we ran
+ * out of buffer space. It doesn't count packets
+ * dropped by the interface driver. It counts only
+ * packets that passed the filter.
+ *
+ * Both statistics include packets not yet read from
+ * the kernel by libpcap, and thus not yet seen by
+ * the application.
+ *
+ * In "linux/net/packet/af_packet.c", at least in the
+ * 2.4.9 kernel, "tp_packets" is incremented for every
+ * packet that passes the packet filter *and* is
+ * successfully queued on the socket; "tp_drops" is
+ * incremented for every packet dropped because there's
+ * not enough free space in the socket buffer.
+ *
+ * When the statistics are returned for a PACKET_STATISTICS
+ * "getsockopt()" call, "tp_drops" is added to "tp_packets",
+ * so that "tp_packets" counts all packets handed to
+ * the PF_PACKET socket, including packets dropped because
+ * there wasn't room on the socket buffer - but not
+ * including packets that didn't pass the filter.
+ *
+ * In the BSD BPF, the count of received packets is
+ * incremented for every packet handed to BPF, regardless
+ * of whether it passed the filter.
+ *
+ * We can't make "pcap_stats()" work the same on both
+ * platforms, but the best approximation is to return
+ * "tp_packets" as the count of packets and "tp_drops"
+ * as the count of drops.
+ */
+ handle->md.stat.ps_recv = kstats.tp_packets;
+ handle->md.stat.ps_drop = kstats.tp_drops;
+ *stats = handle->md.stat;
+ return 0;
+ }
+ else
+ {
+ snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+ "pcap_stats: %s", pcap_strerror(errno));
+ return -1;
+ }
+}
+
+int pcap_set_cluster(pfring *ring, u_int clusterId) {
+ return(pfring_set_cluster(ring, clusterId));
+}
+
+int pcap_remove_from_cluster(pfring *ring) {
+ return(pfring_remove_from_cluster(ring));
+}
+
+int pcap_set_reflector(pfring *ring,
+ char *reflectorDevice) {
+ return(pfring_set_reflector(ring, reflectorDevice));
+}
+
+pfring* pcap_get_pfring_handle(const pcap_t *pHandle) {
+ return(pHandle ? pHandle->ring : NULL);
+}
+#endif /* HAVE_PF_RING */
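For reference, here's a rough sketch of how an application might use
the accessor functions the patch exports (this assumes declarations
for pcap_get_pfring_handle() and pcap_set_cluster() get added to
pcap.h, which this patch doesn't do, that the PF_RING userland header
pfring.h is installed, and that 0 means success from
pfring_set_cluster()):

#include <pcap.h>
#include <pfring.h>
#include <stdio.h>

int main(void)
{
	char errbuf[PCAP_ERRBUF_SIZE];
	pcap_t *p;
	pfring *ring;

	/* Open a device; with the patch, PF_RING is tried first. */
	p = pcap_open_live("eth0", 65535, 1, 1000, errbuf);
	if (p == NULL) {
		fprintf(stderr, "pcap_open_live: %s\n", errbuf);
		return 1;
	}

	/* NULL means PF_RING wasn't used (e.g., no PF_RING support). */
	ring = pcap_get_pfring_handle(p);
	if (ring != NULL) {
		/*
		 * Put this socket into cluster 1, so several processes
		 * can share the load of capturing on one interface.
		 */
		if (pcap_set_cluster(ring, 1) != 0)
			fprintf(stderr, "pcap_set_cluster failed\n");
	}

	/* ... capture with pcap_loop()/pcap_dispatch() as usual ... */

	pcap_close(p);
	return 0;
}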