From: Andrew Gallatin <gallatin@myri•com>
To: Herbert Xu <herbert@gondor•apana.org.au>
Cc: David Miller <davem@davemloft•net>,
brice@myri•com, sgruszka@redhat•com, netdev@vger•kernel.org
Subject: Re: [PATCH] myr10ge: again fix lro_gen_skb() alignment
Date: Tue, 21 Apr 2009 15:19:14 -0400 [thread overview]
Message-ID: <49EE1C32.1060202@myri.com> (raw)
In-Reply-To: <20090416085022.GA19731@gondor.apana.org.au>
[-- Attachment #1: Type: text/plain, Size: 2075 bytes --]
Herbert Xu wrote:
> On Wed, Apr 15, 2009 at 04:42:48PM -0700, David Miller wrote:
>> Herbert has been working on various optimizations to get
>> cxgb3 GRO performance on par with LRO. Perhaps he has
>> some things for you to try :-)
>
> Yes, this patch should improve performace. In fact, when you
> reopen the net-next tree feel free to put this patch in :)
>
> gro: New frags interface to avoid copying shinfo
<...>
Hi Herbert,
With a net-next tree pulled 2 hours ago, I can now see line rate when
using frags with myri10ge on my weakest machines when receiving an
1500b TCP stream. To achieve line rate on these machines with both
inet_lro and GRO, I must bind the netserver and device IRQ to
different CPUs. Unfortunately, CPU accounting seems to currently be
broken in the Linux kernel, so I cannot provide an accurate comparison
at line rate.
So to compare inet_lro and GRO, I'm binding the netserver and device IRQ
to the same CPU. When I do this, that CPU is saturated and GRO is
roughly 17% slower than inet_lro. For comparison, here are netperf
results from a fast peer sending to my weak machine (AMD Athlon(tm) 64
X2 Dual Core Processor 3800+, 2GHz). First inet_lro:
Recv Send Send Utilization Service
Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local
remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
87380 65536 65536 60.02 6631.52 12.45 50.10 0.308
1.238
And now GRO:
87380 65536 65536 60.01 5488.99 9.79 50.00 0.292
1.492
Also, can you tell me how to handle my device, which passes a simple
16-bit checksum across the entire frame (excluding first 14 bytes),
via GRO? Simply setting skb->ip_summed = CHECKSUM_COMPLETE leads
to "hw csum failure".
I've attached my work-in-progress patch so you can see what I'm doing.
I do not want this applied due to performance and correctness issues.
Thanks for your help,
Drew
[-- Attachment #2: myri10ge_gro.diff --]
[-- Type: text/x-diff, Size: 9039 bytes --]
--- /home/gallatin/linux/git/net-next-2.6/drivers/net/myri10ge/myri10ge.c 2009-04-21 14:00:22.166783937 -0400
+++ linux-tmp/drivers/net/myri10ge/myri10ge.c 2009-04-21 15:08:06.241539153 -0400
@@ -75,7 +75,7 @@
#include "myri10ge_mcp.h"
#include "myri10ge_mcp_gen_header.h"
-#define MYRI10GE_VERSION_STR "1.4.4-1.412"
+#define MYRI10GE_VERSION_STR "1.4.4-1.413"
MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
MODULE_AUTHOR("Maintainer: help@myri•com");
@@ -161,8 +161,6 @@ struct myri10ge_rx_done {
dma_addr_t bus;
int cnt;
int idx;
- struct net_lro_mgr lro_mgr;
- struct net_lro_desc lro_desc[MYRI10GE_MAX_LRO_DESCRIPTORS];
};
struct myri10ge_slice_netstats {
@@ -1272,11 +1270,11 @@ myri10ge_unmap_rx_page(struct pci_dev *p
static inline int
myri10ge_rx_done(struct myri10ge_slice_state *ss, struct myri10ge_rx_buf *rx,
- int bytes, int len, __wsum csum)
+ struct napi_struct *napi, int bytes, int len, __wsum csum)
{
struct myri10ge_priv *mgp = ss->mgp;
struct sk_buff *skb;
- struct skb_frag_struct rx_frags[MYRI10GE_MAX_FRAGS_PER_FRAME];
+ struct skb_frag_struct *rx_frags;
int i, idx, hlen, remainder;
struct pci_dev *pdev = mgp->pdev;
struct net_device *dev = mgp->dev;
@@ -1286,6 +1284,19 @@ myri10ge_rx_done(struct myri10ge_slice_s
idx = rx->cnt & rx->mask;
va = page_address(rx->info[idx].page) + rx->info[idx].page_offset;
prefetch(va);
+
+ skb = napi_get_frags(napi);
+ if ((unlikely(!skb))) {
+ for (i = 0, remainder = len; remainder > 0; i++) {
+ myri10ge_unmap_rx_page(pdev, &rx->info[idx], bytes);
+ put_page(rx->info[idx].page);
+ rx->cnt++;
+ idx = rx->cnt & rx->mask;
+ remainder -= MYRI10GE_ALLOC_SIZE;
+ }
+ }
+ rx_frags = skb_shinfo(skb)->frags;
+
/* Fill skb_frag_struct(s) with data from our receive */
for (i = 0, remainder = len; remainder > 0; i++) {
myri10ge_unmap_rx_page(pdev, &rx->info[idx], bytes);
@@ -1300,52 +1311,18 @@ myri10ge_rx_done(struct myri10ge_slice_s
remainder -= MYRI10GE_ALLOC_SIZE;
}
- if (mgp->csum_flag && myri10ge_lro) {
- rx_frags[0].page_offset += MXGEFW_PAD;
- rx_frags[0].size -= MXGEFW_PAD;
- len -= MXGEFW_PAD;
- lro_receive_frags(&ss->rx_done.lro_mgr, rx_frags,
- /* opaque, will come back in get_frag_header */
- len, len,
- (void *)(__force unsigned long)csum, csum);
-
- return 1;
- }
-
- hlen = MYRI10GE_HLEN > len ? len : MYRI10GE_HLEN;
-
- /* allocate an skb to attach the page(s) to. This is done
- * after trying LRO, so as to avoid skb allocation overheads */
-
- skb = netdev_alloc_skb(dev, MYRI10GE_HLEN + 16);
- if (unlikely(skb == NULL)) {
- ss->stats.rx_dropped++;
- do {
- i--;
- put_page(rx_frags[i].page);
- } while (i != 0);
- return 0;
- }
-
- /* Attach the pages to the skb, and trim off any padding */
- myri10ge_rx_skb_build(skb, va, rx_frags, len, hlen);
- if (skb_shinfo(skb)->frags[0].size <= 0) {
- put_page(skb_shinfo(skb)->frags[0].page);
- skb_shinfo(skb)->nr_frags = 0;
- }
- skb->protocol = eth_type_trans(skb, dev);
- skb_record_rx_queue(skb, ss - &mgp->ss[0]);
-
- if (mgp->csum_flag) {
- if ((skb->protocol == htons(ETH_P_IP)) ||
- (skb->protocol == htons(ETH_P_IPV6))) {
- skb->csum = csum;
- skb->ip_summed = CHECKSUM_COMPLETE;
- } else
- myri10ge_vlan_ip_csum(skb, csum);
- }
- netif_receive_skb(skb);
+ rx_frags[0].page_offset += MXGEFW_PAD;
+ rx_frags[0].size -= MXGEFW_PAD;
+ len -= MXGEFW_PAD;
+ skb_shinfo(skb)->nr_frags = i;
+ skb->len = len;
+ skb->data_len = len;
+ skb->truesize += len;
+ /* skb->ip_summed = CHECKSUM_COMPLETE; */
+ skb->ip_summed = CHECKSUM_UNNECESSARY; /* XXXXXX */
+ napi_gro_frags(napi);
return 1;
+
}
static inline void
@@ -1418,7 +1395,8 @@ myri10ge_tx_done(struct myri10ge_slice_s
}
static inline int
-myri10ge_clean_rx_done(struct myri10ge_slice_state *ss, int budget)
+myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
+ struct napi_struct *napi, int budget)
{
struct myri10ge_rx_done *rx_done = &ss->rx_done;
struct myri10ge_priv *mgp = ss->mgp;
@@ -1438,10 +1416,12 @@ myri10ge_clean_rx_done(struct myri10ge_s
checksum = csum_unfold(rx_done->entry[idx].checksum);
if (length <= mgp->small_bytes)
rx_ok = myri10ge_rx_done(ss, &ss->rx_small,
+ napi,
mgp->small_bytes,
length, checksum);
else
rx_ok = myri10ge_rx_done(ss, &ss->rx_big,
+ napi,
mgp->big_bytes,
length, checksum);
rx_packets += rx_ok;
@@ -1455,9 +1435,6 @@ myri10ge_clean_rx_done(struct myri10ge_s
ss->stats.rx_packets += rx_packets;
ss->stats.rx_bytes += rx_bytes;
- if (myri10ge_lro)
- lro_flush_all(&rx_done->lro_mgr);
-
/* restock receive rings if needed */
if (ss->rx_small.fill_cnt - ss->rx_small.cnt < myri10ge_fill_thresh)
myri10ge_alloc_rx_pages(mgp, &ss->rx_small,
@@ -1522,7 +1499,7 @@ static int myri10ge_poll(struct napi_str
#endif
/* process as many rx events as NAPI will allow */
- work_done = myri10ge_clean_rx_done(ss, budget);
+ work_done = myri10ge_clean_rx_done(ss, napi, budget);
if (work_done < budget) {
napi_complete(napi);
@@ -1762,9 +1739,7 @@ static const char myri10ge_gstrings_slic
"----------- slice ---------",
"tx_pkt_start", "tx_pkt_done", "tx_req", "tx_done",
"rx_small_cnt", "rx_big_cnt",
- "wake_queue", "stop_queue", "tx_linearized", "LRO aggregated",
- "LRO flushed",
- "LRO avg aggr", "LRO no_desc"
+ "wake_queue", "stop_queue", "tx_linearized"
};
#define MYRI10GE_NET_STATS_LEN 21
@@ -1863,14 +1838,6 @@ myri10ge_get_ethtool_stats(struct net_de
data[i++] = (unsigned int)ss->tx.wake_queue;
data[i++] = (unsigned int)ss->tx.stop_queue;
data[i++] = (unsigned int)ss->tx.linearized;
- data[i++] = ss->rx_done.lro_mgr.stats.aggregated;
- data[i++] = ss->rx_done.lro_mgr.stats.flushed;
- if (ss->rx_done.lro_mgr.stats.flushed)
- data[i++] = ss->rx_done.lro_mgr.stats.aggregated /
- ss->rx_done.lro_mgr.stats.flushed;
- else
- data[i++] = 0;
- data[i++] = ss->rx_done.lro_mgr.stats.no_desc;
}
}
@@ -2198,67 +2165,6 @@ static void myri10ge_free_irq(struct myr
pci_disable_msix(pdev);
}
-static int
-myri10ge_get_frag_header(struct skb_frag_struct *frag, void **mac_hdr,
- void **ip_hdr, void **tcpudp_hdr,
- u64 * hdr_flags, void *priv)
-{
- struct ethhdr *eh;
- struct vlan_ethhdr *veh;
- struct iphdr *iph;
- u8 *va = page_address(frag->page) + frag->page_offset;
- unsigned long ll_hlen;
- /* passed opaque through lro_receive_frags() */
- __wsum csum = (__force __wsum) (unsigned long)priv;
-
- /* find the mac header, aborting if not IPv4 */
-
- eh = (struct ethhdr *)va;
- *mac_hdr = eh;
- ll_hlen = ETH_HLEN;
- if (eh->h_proto != htons(ETH_P_IP)) {
- if (eh->h_proto == htons(ETH_P_8021Q)) {
- veh = (struct vlan_ethhdr *)va;
- if (veh->h_vlan_encapsulated_proto != htons(ETH_P_IP))
- return -1;
-
- ll_hlen += VLAN_HLEN;
-
- /*
- * HW checksum starts ETH_HLEN bytes into
- * frame, so we must subtract off the VLAN
- * header's checksum before csum can be used
- */
- csum = csum_sub(csum, csum_partial(va + ETH_HLEN,
- VLAN_HLEN, 0));
- } else {
- return -1;
- }
- }
- *hdr_flags = LRO_IPV4;
-
- iph = (struct iphdr *)(va + ll_hlen);
- *ip_hdr = iph;
- if (iph->protocol != IPPROTO_TCP)
- return -1;
- if (iph->frag_off & htons(IP_MF | IP_OFFSET))
- return -1;
- *hdr_flags |= LRO_TCP;
- *tcpudp_hdr = (u8 *) (*ip_hdr) + (iph->ihl << 2);
-
- /* verify the IP checksum */
- if (unlikely(ip_fast_csum((u8 *) iph, iph->ihl)))
- return -1;
-
- /* verify the checksum */
- if (unlikely(csum_tcpudp_magic(iph->saddr, iph->daddr,
- ntohs(iph->tot_len) - (iph->ihl << 2),
- IPPROTO_TCP, csum)))
- return -1;
-
- return 0;
-}
-
static int myri10ge_get_txrx(struct myri10ge_priv *mgp, int slice)
{
struct myri10ge_cmd cmd;
@@ -2329,7 +2235,6 @@ static int myri10ge_open(struct net_devi
struct myri10ge_cmd cmd;
int i, status, big_pow2, slice;
u8 *itable;
- struct net_lro_mgr *lro_mgr;
if (mgp->running != MYRI10GE_ETH_STOPPED)
return -EBUSY;
@@ -2450,19 +2355,6 @@ static int myri10ge_open(struct net_devi
goto abort_with_rings;
}
- lro_mgr = &ss->rx_done.lro_mgr;
- lro_mgr->dev = dev;
- lro_mgr->features = LRO_F_NAPI;
- lro_mgr->ip_summed = CHECKSUM_COMPLETE;
- lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
- lro_mgr->max_desc = MYRI10GE_MAX_LRO_DESCRIPTORS;
- lro_mgr->lro_arr = ss->rx_done.lro_desc;
- lro_mgr->get_frag_header = myri10ge_get_frag_header;
- lro_mgr->max_aggr = myri10ge_lro_max_pkts;
- lro_mgr->frag_align_pad = 2;
- if (lro_mgr->max_aggr > MAX_SKB_FRAGS)
- lro_mgr->max_aggr = MAX_SKB_FRAGS;
-
/* must happen prior to any irq */
napi_enable(&(ss)->napi);
}
@@ -3910,7 +3802,7 @@ static int myri10ge_probe(struct pci_dev
if (dac_enabled)
netdev->features |= NETIF_F_HIGHDMA;
-
+ netdev->features |= NETIF_F_GRO;
/* make sure we can get an irq, and that MSI can be
* setup (if available). Also ensure netdev->irq
* is set to correct value if MSI is enabled */
next prev parent reply other threads:[~2009-04-21 19:20 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-04-15 8:09 [PATCH] myr10ge: again fix lro_gen_skb() alignment Stanislaw Gruszka
2009-04-15 9:28 ` David Miller
2009-04-15 9:48 ` Brice Goglin
2009-04-15 10:02 ` David Miller
2009-04-15 13:01 ` Andrew Gallatin
2009-04-15 21:04 ` Andrew Gallatin
2009-04-15 23:42 ` David Miller
2009-04-16 8:50 ` Herbert Xu
2009-04-16 9:02 ` David Miller
2009-04-21 19:19 ` Andrew Gallatin [this message]
2009-04-22 10:48 ` Herbert Xu
2009-04-22 15:37 ` Andrew Gallatin
2009-04-24 5:45 ` Herbert Xu
2009-04-24 12:45 ` Andrew Gallatin
2009-04-24 12:51 ` Herbert Xu
2009-04-24 17:13 ` Rick Jones
2009-04-24 16:16 ` Andrew Gallatin
2009-04-24 16:30 ` Herbert Xu
2009-04-24 16:31 ` Herbert Xu
2009-04-27 8:05 ` Herbert Xu
2009-04-27 8:07 ` Herbert Xu
2009-04-27 9:32 ` David Miller
2009-04-27 11:01 ` Herbert Xu
2009-04-27 12:45 ` David Miller
2009-04-27 12:45 ` David Miller
2009-04-28 6:12 ` Herbert Xu
2009-04-28 15:00 ` Andrew Gallatin
2009-04-28 15:02 ` David Miller
2009-04-28 15:20 ` Herbert Xu
2009-04-28 15:44 ` Andrew Gallatin
2009-04-28 21:12 ` Andrew Gallatin
2009-04-29 13:42 ` Andrew Gallatin
2009-04-29 13:53 ` Eric Dumazet
2009-04-29 14:18 ` Andrew Gallatin
2009-04-29 15:26 ` Eric Dumazet
2009-04-29 17:28 ` Andrew Gallatin
2009-04-30 8:10 ` Herbert Xu
2009-04-30 8:14 ` Herbert Xu
2009-04-30 8:17 ` Eric Dumazet
2009-04-30 19:14 ` Andrew Gallatin
2009-04-23 8:00 ` Herbert Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=49EE1C32.1060202@myri.com \
--to=gallatin@myri$(echo .)com \
--cc=brice@myri$(echo .)com \
--cc=davem@davemloft$(echo .)net \
--cc=herbert@gondor$(echo .)apana.org.au \
--cc=netdev@vger$(echo .)kernel.org \
--cc=sgruszka@redhat$(echo .)com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox