Linux Network Architecture
Isaac Y. Tsai <[email protected]>
Outline
Linux kernel architecture
Network related kernel data structure
Network device driver
using igb as an example
Linux kernel architecture
Network related kernel data structures
Protocol stack processing
struct sk_buff
defined in <linux/skbuff.h>
src: <kernel src>/net/core/skbuff.c
NIC & protocol stack interface
struct net_device
defined in <linux/netdevice.h>
NIC I/O bus, e.g. PCI, USB, …
I/O bus specific data structure
struct sk_buff
Socket buffer (skb)
Kernel data structure containing control
information required for packet processing
A doubly linked list
When payload data is passed to a socket, a socket buffer is created and the address of the payload data is stored in the structure.
struct sk_buff
Header file : <linux/skbuff.h>
Implementation:
<kernel src>/net/core/skbuff.c
A pointer to the network device
struct net_device *dev;
Pointers to the protocol headers
sk_buff_data_t transport_header, network_header, mac_header;
Pointers to the whole packet
sk_buff_data_t tail, end;
unsigned char *head, *data;
struct sk_buff
struct sk_buff_head
head/data/tail/end fields
head: start of the packet
data: start of packet payload
tail: end of packet payload
end: end of packet
len: amount of data the packet contains
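A rough illustration of these relationships, using the accessor helpers from <linux/skbuff.h> (the function itself is hypothetical; the invariant is head <= data <= tail <= end):
static void skb_layout_example(struct sk_buff *skb)
{
        unsigned int headroom = skb_headroom(skb);  /* data - head */
        unsigned int tailroom = skb_tailroom(skb);  /* end  - tail */

        printk(KERN_DEBUG "headroom=%u len=%u tailroom=%u\n",
               headroom, skb->len, tailroom);
}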
sk_buff functions
struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
void kfree_skb(struct sk_buff *skb)
struct sk_buff *skb_get(struct sk_buff *skb)
struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int new_headroom, int new_tailroom, int gfp_mask)
int skb_cloned(struct sk_buff *skb)
int skb_shared(struct sk_buff *skb)
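A minimal allocation and reference-counting sketch, assuming process context (GFP_KERNEL); the function name is made up:
static int skb_refcount_example(void)
{
        struct sk_buff *skb = alloc_skb(256, GFP_KERNEL); /* 256-byte data area */
        if (!skb)
                return -ENOMEM;

        skb_get(skb);                 /* take a second reference: now "shared" */
        WARN_ON(!skb_shared(skb));

        kfree_skb(skb);               /* drops one reference                   */
        kfree_skb(skb);               /* last reference: buffer is freed       */
        return 0;
}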
Operations on lists of sk_buff
struct sk_buff *skb_peek(struct sk_buff_head *list_)
struct sk_buff *skb_peek_tail(struct sk_buff_head *list_)
__u32 skb_queue_len(struct sk_buff_head *list_)
void skb_queue_head(struct sk_buff_head *list_, struct sk_buff *newsk)
void skb_queue_tail(struct sk_buff_head *list_, struct sk_buff *newsk)
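A minimal usage sketch (the rx_backlog queue and the function are hypothetical; in a real driver the queue would be initialized once at probe/open time, not per packet):
static struct sk_buff_head rx_backlog;

static void queue_example(struct sk_buff *skb)
{
        skb_queue_head_init(&rx_backlog);       /* init the list head and lock */
        skb_queue_tail(&rx_backlog, skb);       /* enqueue at the tail         */

        if (skb_queue_len(&rx_backlog)) {
                struct sk_buff *first = skb_peek(&rx_backlog); /* look only,   */
                (void)first;                                   /* don't unlink */
        }
}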
Operations on sk_buff data
unsigned char *skb_put(struct sk_buff *skb, int len)
unsigned char *skb_push(struct sk_buff *skb, int len)
unsigned char *skb_pull(struct sk_buff *skb, int len)
void skb_reserve(struct sk_buff *skb, int len)
int skb_headroom(struct sk_buff *skb)
int skb_tailroom(struct sk_buff *skb)
int skb_cow(struct sk_buff *skb, int headroom)
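A minimal transmit-side sketch combining these helpers (sizes and the function name are illustrative):
static struct sk_buff *build_frame(const void *payload, unsigned int len)
{
        struct sk_buff *skb = alloc_skb(ETH_HLEN + len, GFP_ATOMIC);
        if (!skb)
                return NULL;

        skb_reserve(skb, ETH_HLEN);               /* leave headroom for the header */
        memcpy(skb_put(skb, len), payload, len);  /* append the payload            */
        skb_push(skb, ETH_HLEN);                  /* step back into the headroom   */
        /* ... fill in the Ethernet header at skb->data ... */
        return skb;
}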
alloc_skb
skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);
... ... ...
size = SKB_DATA_ALIGN(size);
data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
a) skb_put()
b) skb_push()
c) skb_pull()
d) skb_reserve()
skb_reserve
skb_reserve(skb,2)
The Ethernet header is 14 bytes; reserving 2 bytes keeps the
following IP header aligned on a 16-byte boundary
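A typical receive-side allocation sketch using NET_IP_ALIGN (2 on most architectures); netdev and frame_len are assumed to come from the surrounding driver code:
struct sk_buff *skb = netdev_alloc_skb(netdev, frame_len + NET_IP_ALIGN);
if (skb)
        skb_reserve(skb, NET_IP_ALIGN);   /* same effect as skb_reserve(skb, 2) */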
TCP to IP sk_buff operations
UDP to IP sk_buff operations
skb_clone
When an ingress packet needs to be delivered to
multiple recipients
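A minimal sketch (the function name is made up): the clone gets its own struct sk_buff but shares the data area with the original, so no payload is copied:
static void deliver_twice(struct sk_buff *skb)
{
        struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

        if (clone)
                netif_rx(clone);   /* second recipient               */
        netif_rx(skb);             /* original goes to the first one */
}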
a) pskb_copy()
b) skb_copy()
Network Device Driver
struct net_device
Represents a network interface card
Header: <linux/netdevice.h>
Name and index of the network device
State of the device
Device mtu: maximum transmission unit, the
maximum size of frame the device can handle
Pointers to device driver functions
Network device interface
Kernel net_device structure
dev_base stores registered network devices
struct net_device
Activation: open, close, ioctl
Data transfer: hard_start_xmit, poll
Watchdog: tx_timeout, watchdog_timeo
Statistics: get_stats, get_wireless_stats
Configuration: ethtool_ops, change_mtu
Bus specific: mem_start, mem_end
Driver methods are grouped in struct net_device_ops
struct net_device
struct net_device {
        char                            name[IFNAMSIZ];
        struct hlist_node               name_hlist;
        char                            *ifalias;
        unsigned long                   mem_end, mem_start, base_addr;
        unsigned int                    irq;
        unsigned char                   if_port;
        unsigned char                   dma;
        unsigned long                   state;
        struct list_head                dev_list, napi_list, unreg_list;
        unsigned long                   features;
        int                             ifindex, iflink;
        struct net_device_stats         stats;
#ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def     *wireless_handlers;
        struct iw_public_data           *wireless_data;
#endif
        const struct net_device_ops     *netdev_ops;
        const struct ethtool_ops        *ethtool_ops;
        const struct header_ops         *header_ops;
struct net_device (cont'd)
        …
        unsigned int                    flags;
        unsigned short                  gflags, priv_flags, padded;
        unsigned char                   operstate, link_mode;
        unsigned                        mtu;
        unsigned short                  type;
        unsigned short                  hard_header_len;
        unsigned short                  needed_headroom, needed_tailroom;
        struct net_device               *master;
        unsigned char                   perm_addr[MAX_ADDR_LEN], addr_len;
        unsigned short                  dev_id;
        struct netdev_hw_addr_list      uc;
        int                             uc_promisc;
        spinlock_t                      addr_list_lock;
        struct dev_addr_list            *mc_list;
        int                             mc_count;
        unsigned int                    promiscuity, allmulti;
struct net_device_ops
struct net_device_ops {
        int             (*ndo_init)(struct net_device *dev);
        void            (*ndo_uninit)(struct net_device *dev);
        int             (*ndo_open)(struct net_device *dev);
        int             (*ndo_stop)(struct net_device *dev);
        netdev_tx_t     (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev);
        u16             (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb);
        void            (*ndo_change_rx_flags)(struct net_device *dev, int flags);
        void            (*ndo_set_rx_mode)(struct net_device *dev);
        void            (*ndo_set_multicast_list)(struct net_device *dev);
        int             (*ndo_set_mac_address)(struct net_device *dev, void *addr);
        int             (*ndo_validate_addr)(struct net_device *dev);
        int             (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
        int             (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
        int             (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
        int             (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *);
        void            (*ndo_tx_timeout)(struct net_device *dev);
        struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
        void            (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp);
        void            (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid);
        void            (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid);
Interrupt based device driver
Device driver code flow
1. Detecting the hardware device:
Once a network driver is loaded into the kernel, it probes for the hardware devices it supports (I/O ports and IRQ line). The devices found are then registered with the kernel.
2. Registration with the kernel:
Most Linux drivers register themselves with the kernel once loaded. During registration they ask for a unique major/minor number, and a corresponding file appears under /dev with that major/minor number (e.g. /dev/hda for a hard-disk partition). A network driver, however, does not ask for major/minor numbers as other drivers do: there is no "everything is a file" concept for network devices (there is no /dev/eth0 file analogous to /dev/hda). Instead, the network driver inserts a data structure (struct net_device) for each newly detected interface into a global list of network devices. This structure describes the characteristics of the found device.
3. Filling in the net_device structure:
The kernel takes care of some Ethernet defaults through the ether_setup() function, which fills several fields of the net_device structure. Device-specific fields are filled in by the device driver.
Device driver code flow (cont’d)
4. Opening the device (the "open" method):
(a) It requests and is allocated its memory region and IRQs.
(b) The hardware address (popularly known as the "MAC address") is copied from the real hardware into the net_device structure.
(c) The transmit queue of the device is started (netif_start_queue) so that it accepts packets for transmission.
Note: Before the network device can be used, it must be opened by the kernel in response to an "ifconfig / ifup" command. This command assigns an IP address to the device and brings it up. Assigning the IP address happens at OSI layer 3 (network layer, IP), so the device driver (OSI layer 2, MAC) has nothing to do with it; but to bring the device up, the IFF_UP flag of the net_device structure is set and the kernel calls the device's open method to do so.
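A minimal sketch of such an open method for a hypothetical driver "foo" (foo_priv, foo_interrupt and hw_mac_addr are made up; igb_open, shown later, is the real thing):
static int foo_open(struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);
        int err;

        err = request_irq(dev->irq, foo_interrupt, IRQF_SHARED, dev->name, dev);
        if (err)
                return err;

        /* copy the MAC address read from the hardware into net_device */
        memcpy(dev->dev_addr, priv->hw_mac_addr, ETH_ALEN);

        netif_start_queue(dev);  /* ready to accept packets for transmission */
        return 0;
}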
Device driver code flow (cont’d)
5. Transmission of a packet (the "hard_start_xmit" method):
(a) Whenever the kernel needs to transmit a data packet, it calls the "hard_start_xmit" method to put the data on an outgoing queue.
(b) The kernel hands the data (packet) over in the form of a structure called a socket buffer (struct sk_buff).
(c) The device driver does not modify this data; it only performs some sanity checks and then transmits the data by calling highly hardware-dependent routines of the device (a minimal sketch of such a method follows Note3 below).
Note1: The "hard_start_xmit" function is protected from concurrent calls by a spinlock (xmit_lock).
Note2: The hardware interface (Ethernet card) has limited memory for outgoing packets. When this memory is exhausted, the driver tells the kernel (netif_stop_queue) not to start any more transmissions until the hardware is ready to accept new data. Once the driver has stopped its queue, it must arrange to restart the queue at some point in the future, when it is again able to accept packets for transmission; to do so, it calls netif_wake_queue.
Device driver code flow (cont’d)
Note3: If the current system time exceeds the device’s “trans_start” time
(which is set while a packet is transmitted) by at least the timeout
period, the networking layer will eventually call the driver’s
“tx_timeout” method. That method’s job is to clear up the problem
and to ensure the proper completion of any transmissions that were
already in progress.
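A minimal sketch of step 5 for the hypothetical driver "foo" (foo_hw_tx and foo_tx_ring_full stand in for hardware-specific code; the skb is freed later from the TX-completion path, e.g. with dev_kfree_skb_irq):
static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);

        foo_hw_tx(priv, skb);            /* hand the frame to the NIC         */
        dev->trans_start = jiffies;      /* consulted by the tx_timeout logic */

        if (foo_tx_ring_full(priv))
                netif_stop_queue(dev);   /* no room left: stop taking packets */

        return NETDEV_TX_OK;
}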
6. Reception of a packet:
(a) When a packet arrives at the hardware, it raises the corresponding interrupt and the driver's interrupt-handling routine is called.
(b) This routine receives a pointer to the data and its length (the packet), which are already available in memory. Its responsibility is to send the packet up to the higher layers of the networking code.
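A minimal (non-NAPI) sketch of this receive path for the hypothetical driver "foo"; frame and len are assumed to have been obtained from the hardware by the interrupt handler:
static void foo_rx(struct net_device *dev, const void *frame, unsigned int len)
{
        struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        skb_reserve(skb, NET_IP_ALIGN);            /* align the IP header         */
        memcpy(skb_put(skb, len), frame, len);     /* copy the frame into the skb */
        skb->protocol = eth_type_trans(skb, dev);  /* also sets skb->dev          */
        netif_rx(skb);                             /* hand off to the stack       */
}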
Device driver code flow (cont’d)
7. Closing/releasing/stopping the device (the "stop" method):
(a) It releases the allocated memory and IRQs.
(b) The transmit queue of the device is stopped (netif_stop_queue) from accepting packets for transmission.
Note: This method is called when we issue an "ifdown <dev>" command.
8. Changes in link state:
The networking subsystem needs to know when network links go up or down, and it provides a few functions that the driver may use to convey that information: netif_carrier_off, netif_carrier_on and netif_carrier_ok.
igb_main.c module constructor
static int __init igb_init_module(void) {
int ret;
printk(KERN_INFO "%s - version %s\n", igb_driver_string,igb_driver_version);
printk(KERN_INFO "%s\n", igb_copyright);
#ifdef IGB_DCA
dca_register_notify(&dca_notifier);
#endif
ret = pci_register_driver(&igb_driver);
#ifdef USE_REBOOT_NOTIFIER
if (ret >= 0) { register_reboot_notifier(&igb_notifier_reboot); }
#endif
#ifdef ENABLE_TNAPI
thread_proc_init();
#endif
return ret;
}
module_init(igb_init_module);
igb_driver variable
static struct pci_driver igb_driver = {
.name = igb_driver_name,
.id_table = igb_pci_tbl,
.probe = igb_probe,
.remove = __devexit_p(igb_remove),
#ifdef CONFIG_PM
/* Power Management Hooks */
.suspend = igb_suspend,
.resume = igb_resume,
#endif
#ifndef USE_REBOOT_NOTIFIER
.shutdown = igb_shutdown,
#endif
#ifdef HAVE_PCI_ERS
.err_handler = &igb_err_handler,
#endif
};
igb_netdev_ops variable
static const struct net_device_ops igb_netdev_ops = {
        .ndo_open               = igb_open,
        .ndo_stop               = igb_close,
        .ndo_start_xmit         = igb_xmit_frame_adv,
        .ndo_get_stats          = igb_get_stats,
        .ndo_set_rx_mode        = igb_set_rx_mode,
        .ndo_set_multicast_list = igb_set_rx_mode,
        .ndo_set_mac_address    = igb_set_mac,
        .ndo_change_mtu         = igb_change_mtu,
        .ndo_do_ioctl           = igb_ioctl,
        .ndo_tx_timeout         = igb_tx_timeout,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_vlan_rx_register   = igb_vlan_rx_register,
        .ndo_vlan_rx_add_vid    = igb_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid   = igb_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = igb_netpoll,
#endif
};
igb_main.c module destructor
static void __exit igb_exit_module(void){
#ifdef IGB_DCA
dca_unregister_notify(&dca_notifier);
#endif
#ifdef USE_REBOOT_NOTIFIER
unregister_reboot_notifier(&igb_notifier_reboot);
#endif
pci_unregister_driver(&igb_driver);
#ifdef ENABLE_TNAPI
thread_proc_term();
#endif
}
module_exit(igb_exit_module);
printk : kernel’s printf
Header: <linux/kernel.h>
Arguments are the same as printf
Format specifiers: similar to printf but no float and double
An initial 3 character sequence for log level
KERN_EMERG      "<0>"   /* system is unusable */
KERN_ALERT      "<1>"   /* action must be taken immediately */
KERN_CRIT       "<2>"   /* critical conditions */
KERN_ERR        "<3>"   /* error conditions */
KERN_WARNING    "<4>"   /* warning conditions */
KERN_NOTICE     "<5>"   /* normal but significant conditions */
KERN_INFO       "<6>"   /* informational */
KERN_DEBUG      "<7>"   /* debug-level messages */
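For example (the messages are illustrative; igb_init_module on an earlier slide uses the same pattern):
printk(KERN_INFO "igb: loading driver version %s\n", igb_driver_version);
printk(KERN_ERR "igb: failed to map device registers\n");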
Module
Basic headers
<linux/module.h>
<linux/version.h>
<linux/kernel.h>
MODULE_DEVICE_TABLE(pci, igb_pci_tbl);
MODULE_AUTHOR()
MODULE_DESCRIPTION()
MODULE_LICENSE()
MODULE_VERSION()
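Typical usage (the strings shown here are illustrative; the real values live in igb_main.c):
MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("Intel(R) Gigabit Ethernet Network Driver");
MODULE_LICENSE("GPL");
MODULE_VERSION("x.y.z");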
Net device registration
Header: <linux/netdevice.h>
Net device storage
struct net_device *alloc_etherdev(sizeof_priv)
Registering net device
int register_netdev(struct net_device *)
void unregister_netdev(struct net_device *)
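A minimal registration sketch for a hypothetical driver "foo" (struct foo_priv and foo_netdev_ops are made up; a PCI driver would normally do this from its probe routine):
static int foo_register(void)
{
        struct net_device *netdev = alloc_etherdev(sizeof(struct foo_priv));
        int err;

        if (!netdev)
                return -ENOMEM;
        netdev->netdev_ops = &foo_netdev_ops;  /* open/stop/start_xmit/... hooks */

        err = register_netdev(netdev);         /* interface shows up as ethX     */
        if (err)
                free_netdev(netdev);
        return err;
}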
igb.h struct igb_adapter
struct igb_adapter {
struct timer_list watchdog_timer, phy_info_timer;
struct vlan_group *vlgrp;
u16 mng_vlan_id;
u32 bd_number, wol, en_mng_pt;
u16 link_speed, link_duplex;
unsigned int total_tx_bytes, total_tx_packets, total_rx_bytes, total_rx_packets;
/* Interrupt Throttle Rate */
u32 itr, itr_setting;
u16 tx_itr, rx_itr;
struct work_struct reset_task, watchdog_task;
bool fc_autoneg;
u8 tx_timeout_factor;
#ifdef ETHTOOL_PHYS_ID
struct timer_list blink_timer;
unsigned long led_status;
#endif
struct igb_adapter
/* TX */
struct igb_ring *tx_ring; /* One per active queue */
unsigned int restart_queue;
unsigned long tx_queue_len;
u32 tx_timeout_count;
/* RX */
struct igb_ring *rx_ring; /* One per active queue */
int num_tx_queues, num_rx_queues;
u64 hw_csum_err, hw_csum_good;
u32 alloc_rx_buff_failed, max_frame_size, min_frame_size;
/* OS defined structs */
struct net_device *netdev;
struct pci_dev *pdev;
struct net_device_stats net_stats;
driver init
Init
allocate ring buffers and associated sk_buff
allocate and initialize net_device
register the net_device
get the MAC address and other settings from the device EEPROM
request firmware download, if needed
int request_firmware(fw,name,device)
register a packet receive interrupt
mostly postponed until the device is opened
igb_open()
static int igb_open(struct net_device *netdev)
{
struct igb_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &adapter->hw;
int err, i;
if (test_bit(__IGB_TESTING, &adapter->state)) return -EBUSY;
err = igb_setup_all_tx_resources(adapter);
if (err) goto err_setup_tx;
err = igb_setup_all_rx_resources(adapter);
if (err) goto err_setup_rx;
igb_configure(adapter);
err = igb_request_irq(adapter);
if (err) goto err_req_irq;
clear_bit(__IGB_DOWN, &adapter->state);
for (i = 0; i < adapter->num_q_vectors; i++) {
struct igb_q_vector *q_vector = adapter->q_vector[i];
napi_enable(&q_vector->napi);
}
igb_open() (cont'd)
igb_configure_lli(adapter);
E1000_READ_REG(hw, E1000_ICR);
igb_irq_enable(adapter);
if (adapter->vfs_allocated_count) {
u32 reg_data = E1000_READ_REG(hw, E1000_CTRL_EXT);
reg_data |= E1000_CTRL_EXT_PFRSTD;
E1000_WRITE_REG(hw, E1000_CTRL_EXT, reg_data);
}
netif_tx_start_all_queues(netdev); hw->mac.get_link_status = 1;
mod_timer(&adapter->watchdog_timer, jiffies + 1);
return E1000_SUCCESS;
err_req_irq:
igb_release_hw_control(adapter); igb_free_all_rx_resources(adapter);
err_setup_rx:
igb_free_all_tx_resources(adapter);
err_setup_tx:
igb_reset(adapter); return err;
}
igb_close()
static int igb_close(struct net_device *netdev)
{
struct igb_adapter *adapter = netdev_priv(netdev);
WARN_ON(test_bit(__IGB_RESETTING, &adapter->state));
igb_down(adapter);
igb_free_irq(adapter);
igb_free_all_tx_resources(adapter);
igb_free_all_rx_resources(adapter);
return 0;
}
igb_xmit_frame_adv()
static netdev_tx_t igb_xmit_frame_adv(struct sk_buff *skb, struct
net_device *netdev)
{
struct igb_adapter *adapter = netdev_priv(netdev);
struct igb_ring *tx_ring;
#ifdef HAVE_TX_MQ
int r_idx = 0;
r_idx = skb->queue_mapping & (IGB_ABS_MAX_TX_QUEUES - 1);
tx_ring = adapter->multi_tx_table[r_idx];
#else
tx_ring = &adapter->tx_ring[0];
#endif
/* This goes back to the question of how to logically map a tx queue
* to a flow. Right now, performance is impacted slightly negatively
* if using multiple tx queues. If the stack breaks away from a
* single qdisc implementation, we can look at this again. */
return igb_xmit_frame_ring_adv(skb, netdev, tx_ring);
}
NAPI
NAPI (New API): a device driver framework
for high-speed networking
Interrupt mitigation
disable some interrupts during high traffic
Packet throttling
drop packets early, before further processing
NAPI interface
static inline void netif_napi_add(struct net_device *dev,
struct napi_struct *napi, int (*poll)(struct napi_struct *,
int), int weight)
static inline void napi_enable(struct napi_struct *n)
static inline void napi_disable(struct napi_struct *n)
static inline void netif_rx_schedule(struct net_device *dev,
struct napi_struct *napi)
static inline void netif_rx_complete(struct net_device *dev,
struct napi_struct *napi)
int netif_receive_skb(struct sk_buff *skb)
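A minimal NAPI skeleton for a hypothetical driver "foo", using the napi_schedule()/napi_complete() names that the igb code on the following slides also uses (foo_priv, foo_disable_rx_irq, foo_enable_rx_irq and foo_rx_one are made up):
static irqreturn_t foo_interrupt(int irq, void *data)
{
        struct foo_priv *priv = data;

        foo_disable_rx_irq(priv);      /* no more RX interrupts for now     */
        napi_schedule(&priv->napi);    /* ask the kernel to poll us shortly */
        return IRQ_HANDLED;
}

static int foo_poll(struct napi_struct *napi, int budget)
{
        struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
        int work_done = 0;

        /* foo_rx_one() pulls one frame off the RX ring and feeds it to the
         * stack with netif_receive_skb(); it returns 0 when the ring is empty */
        while (work_done < budget && foo_rx_one(priv))
                work_done++;

        if (work_done < budget) {      /* ring drained: back to interrupt mode */
                napi_complete(napi);
                foo_enable_rx_irq(priv);
        }
        return work_done;
}

/* registered once at init time, e.g. from the probe routine:   */
/*     netif_napi_add(netdev, &priv->napi, foo_poll, 64);       */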
NAPI in packet reception
When a new packet is available, the interrupt routine should disable
any further "packet available" interrupts and tell the network
subsystem to poll the driver shortly to pick up all available
packets.
Arrange for polling:
void netif_rx_schedule(struct net_device *dev);
Create a poll() method in the driver
int (*poll)(struct net_device *dev, int *budget);
poll() should process all available incoming packets.
Packets should not be passed to netif_rx(); instead, use
int netif_receive_skb(struct sk_buff *skb);
NAPI in packet reception (cont'd)
A new struct net_device field called quota contains the maximum
number of packets that the networking subsystem is prepared
to receive from your driver at this time. Once you have
exhausted that quota, no further packets should be fed to the
kernel in this poll() call.
The budget parameter also places a limit on the number of
packets which your driver may process. Whichever of budget
and quota is lower is the real limit.
Your driver should decrement dev->quota by the number of
packets it processed. The value pointed to by the budget
parameter should also be decremented by the same amount.
If packets remain to be processed (i.e. the driver used its entire
quota), poll() should return a value of one.
If, instead, all packets have been processed, your driver should
reenable interrupts, turn off polling, and return zero. Polling
is stopped with:
void netif_rx_complete(struct net_device *dev);
Packet receive interrupt
Receive interrupt handler
minimally handles the packet received
sanity checks
puts back the sk_buffs for re-use
Passes the associated sk_buffs (and ring buffers)
to the protocol layer by NET_RX_SOFTIRQ
int netif_rx(struct sk_buff *)
When network load is heavy, switch to poll mode,
if supported
igb_intr()
static irqreturn_t igb_intr(int irq, void *data)
{
struct igb_adapter *adapter = data;
struct igb_q_vector *q_vector = adapter->q_vector[0];
struct e1000_hw *hw = &adapter->hw;
u32 icr = E1000_READ_REG(hw, E1000_ICR);
if (!icr) return IRQ_NONE; igb_write_itr(q_vector);
if (!(icr & E1000_ICR_INT_ASSERTED)) return IRQ_NONE;
if (icr & E1000_ICR_DOUTSYNC) { adapter->stats.doosync++; }
if (icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
hw->mac.get_link_status = 1;
if (!test_bit(__IGB_DOWN, &adapter->state))
mod_timer(&adapter->watchdog_timer, jiffies + 1);
}
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
igb_intr_msi()
static irqreturn_t igb_intr_msi(int irq, void *data)
{
struct igb_adapter *adapter = data;
struct igb_q_vector *q_vector = adapter->q_vector[0];
struct e1000_hw *hw = &adapter->hw;
u32 icr = E1000_READ_REG(hw, E1000_ICR);
igb_write_itr(q_vector);
if (icr & E1000_ICR_DOUTSYNC) { adapter->stats.doosync++; }
if (icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
hw->mac.get_link_status = 1;
if (!test_bit(__IGB_DOWN, &adapter->state))
mod_timer(&adapter->watchdog_timer, jiffies + 1);
}
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
napi_schedule()
kcompat.h
#define napi_schedule(_napi) netif_rx_schedule(napi_to_poll_dev(_napi))
kcompat.c
struct net_device *napi_to_poll_dev(struct napi_struct *napi)
{
struct adapter_q_vector *q_vector = container_of(napi,
struct adapter_q_vector, napi);
return &q_vector->poll_dev;
}
igb_poll()
static int igb_poll(struct napi_struct *napi, int budget)
{
struct igb_q_vector *q_vector = container_of(napi, struct igb_q_vector,
napi);
int tx_clean_complete = 1, work_done = 0;
#ifdef IGB_DCA
if (q_vector->adapter->flags & IGB_FLAG_DCA_ENABLED)
igb_update_dca(q_vector);
#endif
if (q_vector->tx_ring) tx_clean_complete = igb_clean_tx_irq(q_vector);
if (q_vector->rx_ring) igb_clean_rx_irq_adv(q_vector, &work_done,
budget);
if (!tx_clean_complete) work_done = budget;
#ifndef HAVE_NETDEV_NAPI_LIST
if (!netif_running(q_vector->adapter->netdev)) work_done = 0;
#endif
if (work_done < budget) {
napi_complete(napi); igb_ring_irq_enable(q_vector);
}
return work_done;
}
Calling sequence to install igb_poll()
igb_open()
igb_request_irq(struct igb_adapter *adapter)
igb_alloc_q_vectors(struct igb_adapter *adapter)
netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
igb_sw_init(), igb_resume()
igb_init_interrupt_scheme(struct igb_adapter *adapter)
igb_alloc_q_vectors(struct igb_adapter *adapter)
netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
igb_poll() will never be called if weight is not initialized and left as
zero. Gigabit adapter drivers tend to set weight to 64; smaller
values can be used for slower media.
netif_napi_add()
#define netif_napi_add(_netdev, _napi, _poll, _weight) \
do { \
struct napi_struct *__napi = (_napi); \
struct net_device *poll_dev = napi_to_poll_dev(__napi); \
poll_dev->poll = &(__kc_adapter_clean); \
poll_dev->priv = (_napi); \
poll_dev->weight = (_weight); \
set_bit(__LINK_STATE_RX_SCHED, &poll_dev->state); \
set_bit(__LINK_STATE_START, &poll_dev->state);\
dev_hold(poll_dev); \
_netdev->poll = &(__kc_adapter_clean); \
_netdev->weight = (_weight); \
__napi->poll = &(_poll); \
__napi->weight = (_weight); \
__napi->dev = (_netdev); \
set_bit(__LINK_STATE_RX_SCHED, &(_netdev)->state); \
} while (0)
igb_clean_rx_irq_adv()
static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector, int
*work_done, int budget)
{
struct igb_adapter *adapter = q_vector->adapter;
struct net_device *netdev = adapter->netdev;
struct igb_ring *rx_ring = q_vector->rx_ring;
struct pci_dev *pdev = rx_ring->pdev;
union e1000_adv_rx_desc *rx_desc , *next_rxd;
struct igb_buffer *buffer_info , *next_buffer;
struct sk_buff *skb;
bool cleaned = FALSE;
int cleaned_count = 0;
unsigned int total_bytes = 0, total_packets = 0, i;
u32 staterr;
u16 length;
igb_clean_rx_irq_adv() (cont'd)
i = rx_ring->next_to_clean;
buffer_info = &rx_ring->buffer_info[i];
rx_desc = E1000_RX_DESC_ADV(*rx_ring, i);
staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
#ifdef ENABLE_TNAPI
if(!adapter->tnapi.shutdown) {
wake_up_interruptible(&adapter->tnapi.packet_waitqueue[rx_ring->queue_index]);
return TRUE;
}
#endif
while (staterr & E1000_RXD_STAT_DD) {
if (*work_done >= budget) break; (*work_done)++;
skb = buffer_info->skb; prefetch(skb->data - NET_IP_ALIGN);
buffer_info->skb = NULL; i++; if (i == rx_ring->count) i = 0;
next_rxd = E1000_RX_DESC_ADV(*rx_ring, i);
prefetch(next_rxd); next_buffer = &rx_ring->buffer_info[i];
length = le16_to_cpu(rx_desc->wb.upper.length);
cleaned = TRUE; cleaned_count++;
igb_clean_rx_irq_adv() (cont'd)
send_up:
#endif /* CONFIG_IGB_DISABLE_PACKET_SPLIT */
if (staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) {
dev_kfree_skb_irq(skb); goto next_desc;
}
#ifdef SIOCSHWTSTAMP
igb_rx_hwtstamp(adapter, staterr, skb);
#endif
total_bytes += skb->len; total_packets++;
igb_rx_checksum_adv(rx_ring, staterr, skb);
#ifndef ETH_TYPE_TRANS_SETS_DEV
skb->dev = netdev;
#endif
skb->protocol = eth_type_trans(skb, netdev);
igb_receive_skb(rx_ring, staterr, rx_desc, skb);
netdev->last_rx = jiffies;
igb_receive_skb()
#ifdef IGB_LRO
lro_vlan_hwaccel_receive_skb(&ring->lro_mgr,
skb, adapter->vlgrp, le16_to_cpu(rx_desc->wb.upper.vlan),
rx_desc);
lro_receive_skb(&ring->lro_mgr, skb, rx_desc);
#endif
vlan_gro_receive(&q_vector->napi, adapter->vlgrp,
le16_to_cpu(rx_desc->wb.upper.vlan), skb);
napi_gro_receive(&q_vector->napi, skb);
kcompat.h:
#define napi_gro_receive(_napi, _skb) netif_receive_skb(_skb)
lro_receive_skb()
<kernel src>/net/ipv4/inet_lro.c
void lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb, void *priv)
{
if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
if (lro_mgr->features & LRO_F_NAPI)
netif_receive_skb(skb);
else
netif_rx(skb);
}
}
netif_receive_skb()
<kernel src>/net/core/dev.c
int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev, *master, *null_or_orig,
*null_or_bond;
int ret = NET_RX_DROP;
__be16 type;
if (!skb->tstamp.tv64) net_timestamp(skb);
if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
return NET_RX_SUCCESS;
if (netpoll_receive_skb(skb)) return NET_RX_DROP;
if (!skb->skb_iif) skb->skb_iif = skb->dev->ifindex;
null_or_orig = NULL; orig_dev = skb->dev;
master = ACCESS_ONCE(orig_dev->master);
netif_receive_skb() (cont'd)
if (master) {
if (skb_bond_should_drop(skb, master)) null_or_orig =
orig_dev;
else skb->dev = master;
}
__get_cpu_var(netdev_rx_stat).total++;
skb_reset_network_header(skb); skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
pt_prev = NULL;
rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); goto ncls; }
#endif
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
netif_receive_skb() (cont'd)
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out;
ncls:
#endif
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb) goto out;
skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
if (!skb) goto out;
null_or_bond = NULL;
if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
(vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
null_or_bond = vlan_dev_real_dev(skb->dev);
}
type = skb->protocol;
netif_receive_skb() (cont'd)
list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) &
PTYPE_HASH_MASK], list) {
if (ptype->type == type && (ptype->dev == null_or_orig ||
ptype->dev == skb->dev || ptype->dev == orig_dev ||
ptype->dev == null_or_bond)) {
if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
kfree_skb(skb);
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
Large segment offload
Large segment offload (LSO) is a technique for increasing the outbound throughput of high-bandwidth network connections by reducing CPU overhead. It works by queuing up large buffers and letting the NIC split them into separate packets. The technique is also called TCP segmentation offload (TSO) when applied to TCP, or generic segmentation offload (GSO). When large chunks of data are to be sent over a computer network, they must first be broken down into smaller segments that can pass through all the network elements (routers, switches) between the source and destination computers; this process is referred to as segmentation. Segmentation is often done by the TCP protocol on the host computer; offloading this work to the network card is called TCP segmentation offload (TSO).
Large receive offload (LRO) is the corresponding technique for increasing inbound throughput: it aggregates multiple incoming packets from a single stream into a larger buffer before they are passed higher up the networking stack, reducing the number of packets that have to be processed. LRO combines multiple Ethernet frames into a single receive in the stack, thereby potentially decreasing CPU utilization for receives.
LRO and GRO
Generic receive offload (GRO) attempts to replicate on the receive side the success of the transmit-side offload mechanism TSO (TCP segmentation offload). This is crucial for 10 Gb/s Ethernet, where the standard MTU of 1500 bytes imposes a huge burden on the CPU, which can no longer keep up without assistance. TSO is one of the techniques devised to resolve this problem on the transmit side, i.e., the side that is of most interest to servers/data producers. However, as data rates continue to increase, the receive side has also become a bottleneck. Following in the footsteps of LRO (large receive offload), GRO attempts to resolve this problem without causing conflicts with other parts of the network stack, such as forwarding and bridging.
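A driver advertises such offloads through net_device::features; which flags may be set depends on the hardware (this fragment is illustrative):
netdev->features |= NETIF_F_SG | NETIF_F_TSO;  /* scatter/gather + TSO on transmit */
netdev->features |= NETIF_F_GRO;               /* software GRO on receive          */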
Flow control related APIs
For interaction with protocol layer
Header: <linux/netdevice.h>
APIs
void netif_start_queue(struct net_device *)
void netif_stop_queue(struct net_device *)
void netif_wake_queue(struct net_device *)
int netif_queue_stopped(struct net_device *)
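The usual pattern (sketch): ndo_start_xmit stops the queue when the TX ring fills up, and the TX-completion handler wakes it once descriptors have been reclaimed (foo_tx_ring_has_room and foo_priv are hypothetical):
static void foo_tx_complete(struct net_device *dev, struct foo_priv *priv)
{
        /* ... reclaim finished TX descriptors, free their sk_buffs ... */
        if (netif_queue_stopped(dev) && foo_tx_ring_has_room(priv))
                netif_wake_queue(dev);  /* stack may call ndo_start_xmit again */
}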
netif_start_queue(), netif_stop_queue()
<linux/netdevice.h>
static inline void netif_start_queue(struct net_device *dev){
netif_tx_start_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_start_queue(struct netdev_queue
*dev_queue){
clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}
static inline void netif_stop_queue(struct net_device *dev){
netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_stop_queue(struct netdev_queue
*dev_queue){
set_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}
netif_wake_queue(), netif_queue_stopped()
static inline void netif_wake_queue(struct net_device *dev){
netif_tx_wake_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue){
#ifdef CONFIG_NETPOLL_TRAP
if (netpoll_trap()) {
netif_tx_start_queue(dev_queue);
return;
}
#endif
if (test_and_clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state))
__netif_schedule(dev_queue->qdisc);
}
static inline int netif_queue_stopped(const struct net_device *dev){
return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0));
}
static inline int netif_tx_queue_stopped(const struct netdev_queue
*dev_queue)
{
return test_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}
References
Linux Network Architecture
[1] C. Benvenuti, Understanding Linux Network Internals, O'Reilly Media, 2005.
[2] J. Corbet, G. Kroah-Hartman, and A. Rubini, Linux Device Drivers, 3rd ed.,
O'Reilly, 2005.
[3] K. Wehrle, F. Pahlke, H. Ritter et al., Linux Network Architecture, Prentice
Hall, 2004.
TCP Performance
[1] A. P. Foong, T. R. Huff, H. H. Hum et al., “TCP Performance Re-Visited,” in
International Symposium on Performance Analysis of Systems and Software
(ISPASS'03), Austin, Texas, USA, 2003.
[2] L. Grossman, “Large Receive Offload Implementation in Neterion 10GbE
Ethernet Driver,” in Proceedings of the Linux Symposium, Ottawa, Ontario,
Canada, 2005, pp. 195-200.
[3] A. Menon, and W. Zwaenepoel, “Optimizing TCP receive performance,” in
USENIX 2008 Annual Technical Conference on Annual Technical Conference,
Boston, Massachusetts, USA, 2008, pp. 85-98.
Questions?