net/ipv4/ip_output.c

Name: Public Git Hosting - davej-history.git/blob - net/ipv4/ip_output.c
Rating: 4.9 (4258 reviews)
 1 /*
 2  * INET An implementation of the TCP/IP protocol suite for the LINUX
 3  * operating system. INET is implemented using the BSD Socket
 4  * interface as the means of communication with the user level.
 5  *
 6  * The Internet Protocol (IP) output module.
 7  *
 8  * Version: $Id: ip_output.c,v 1.72 1999/09/07 02:31:15 davem Exp $
 9  *
 10  * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
 11  * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  * Donald Becker, <becker@super.org>
 13  * Alan Cox, <Alan.Cox@linux.org>
 14  * Richard Underwood
 15  * Stefan Becker, <stefanb@yello.ping.de>
 16  * Jorge Cwik, <jorge@laser.satlink.net>
 17  * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 18  *
 19  * See ip_input.c for original log
 20  *
 21  * Fixes:
 22  * Alan Cox : Missing nonblock feature in ip_build_xmit.
 23  * Mike Kilburn : htons() missing in ip_build_xmit.
 24  * Bradford Johnson: Fix faulty handling of some frames when
 25  * no route is found.
 26  * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
 27  * (in case if packet not accepted by
 28  * output firewall rules)
 29  * Mike McLagan : Routing by source
 30  * Alexey Kuznetsov: use new route cache
 31  * Andi Kleen: Fix broken PMTU recovery and remove
 32  * some redundant tests.
 33  * Vitaly E. Lavrov : Transparent proxy revived after year coma.
 34  * Andi Kleen : Replace ip_reply with ip_send_reply.
 35  * Andi Kleen : Split fast and slow ip_build_xmit path
 36  * for decreased register pressure on x86
 37  * and more readibility.
 38  * Marc Boucher : When call_out_firewall returns FW_QUEUE,
 39  * silently drop skb instead of failing with -EPERM.
 40  */
 41
 42 #include <asm/uaccess.h>
 43 #include <asm/system.h>
 44 #include <linux/types.h>
 45 #include <linux/kernel.h>
 46 #include <linux/sched.h>
 47 #include <linux/mm.h>
 48 #include <linux/string.h>
 49 #include <linux/errno.h>
 50 #include <linux/config.h>
 51
 52 #include <linux/socket.h>
 53 #include <linux/sockios.h>
 54 #include <linux/in.h>
 55 #include <linux/inet.h>
 56 #include <linux/netdevice.h>
 57 #include <linux/etherdevice.h>
 58 #include <linux/proc_fs.h>
 59 #include <linux/stat.h>
 60 #include <linux/init.h>
 61
 62 #include <net/snmp.h>
 63 #include <net/ip.h>
 64 #include <net/protocol.h>
 65 #include <net/route.h>
 66 #include <net/tcp.h>
 67 #include <net/udp.h>
 68 #include <linux/skbuff.h>
 69 #include <net/sock.h>
 70 #include <net/arp.h>
 71 #include <net/icmp.h>
 72 #include <net/raw.h>
 73 #include <net/checksum.h>
 74 #include <linux/igmp.h>
 75 #include <linux/netfilter_ipv4.h>
 76 #include <linux/mroute.h>
 77 #include <linux/netlink.h>
 78
 79 /*
 80  * Shall we try to damage output packets if routing dev changes?
 81  */
 82
 83 int sysctl_ip_dynaddr =0;
 84
 85 int ip_id_count =0;
 86
 87 /* Generate a checksum for an outgoing IP datagram. */
 88 __inline__ voidip_send_check(struct iphdr *iph)
 89 {
 90  iph->check =0;
 91  iph->check =ip_fast_csum((unsigned char*)iph, iph->ihl);
 92 }
 93
 94 /* dev_loopback_xmit for use with netfilter. */
 95 static intip_dev_loopback_xmit(struct sk_buff *newskb)
 96 {
 97  newskb->mac.raw = newskb->data;
 98 skb_pull(newskb, newskb->nh.raw - newskb->data);
 99  newskb->pkt_type = PACKET_LOOPBACK;
 100  newskb->ip_summed = CHECKSUM_UNNECESSARY;
 101 BUG_TRAP(newskb->dst);
 102
 103 #ifdef CONFIG_NETFILTER_DEBUG
 104 nf_debug_ip_loopback_xmit(newskb);
 105 #endif
 106 netif_rx(newskb);
 107 return0;
 108 }
 109
 110 #ifdef CONFIG_NETFILTER
 111 /* To preserve the cute illusion that a locally-generated packet can
 112  be mangled before routing, we actually reroute if a hook altered
 113  the packet. -RR */
 114 static introute_me_harder(struct sk_buff *skb)
 115 {
 116 struct iphdr *iph = skb->nh.iph;
 117 struct rtable *rt;
 118
 119 if(ip_route_output(&rt, iph->daddr, iph->saddr,
 120 RT_TOS(iph->tos) | RTO_CONN,
 121  skb->sk ? skb->sk->bound_dev_if :0)) {
 122 printk("route_me_harder: No more route.\n");
 123 return-EINVAL;
 124 }
 125
 126 /* Drop old route. */
 127 dst_release(skb->dst);
 128
 129  skb->dst = &rt->u.dst;
 130 return0;
 131 }
 132 #endif
 133
 134 /* Do route recalc if netfilter changes skb. */
 135 staticinlineint
 136 output_maybe_reroute(struct sk_buff *skb)
 137 {
 138 #ifdef CONFIG_NETFILTER
 139 if(skb->nfcache & NFC_ALTERED) {
 140 if(route_me_harder(skb) !=0) {
 141 kfree_skb(skb);
 142 return-EINVAL;
 143 }
 144 }
 145 #endif
 146 return skb->dst->output(skb);
 147 }
 148
 149 /*
 150  * Add an ip header to a skbuff and send it out.
 151  */
 152 voidip_build_and_send_pkt(struct sk_buff *skb,struct sock *sk,
 153  u32 saddr, u32 daddr,struct ip_options *opt)
 154 {
 155 struct rtable *rt = (struct rtable *)skb->dst;
 156 struct iphdr *iph;
 157
 158 /* Build the IP header. */
 159 if(opt)
 160  iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 161 else
 162  iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 163
 164  iph->version =4;
 165  iph->ihl =5;
 166  iph->tos = sk->protinfo.af_inet.tos;
 167  iph->frag_off =0;
 168 if(ip_dont_fragment(sk, &rt->u.dst))
 169  iph->frag_off |=htons(IP_DF);
 170  iph->ttl = sk->protinfo.af_inet.ttl;
 171  iph->daddr = rt->rt_dst;
 172  iph->saddr = rt->rt_src;
 173  iph->protocol = sk->protocol;
 174  iph->tot_len =htons(skb->len);
 175  iph->id =htons(ip_id_count++);
 176  skb->nh.iph = iph;
 177
 178 if(opt && opt->optlen) {
 179  iph->ihl += opt->optlen>>2;
 180 ip_options_build(skb, opt, daddr, rt,0);
 181 }
 182 ip_send_check(iph);
 183
 184 /* Send it out. */
 185 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, NULL,
 186  output_maybe_reroute);
 187 }
 188
 189 staticinlineintip_finish_output2(struct sk_buff *skb)
 190 {
 191 struct dst_entry *dst = skb->dst;
 192 struct hh_cache *hh = dst->hh;
 193
 194 #ifdef CONFIG_NETFILTER_DEBUG
 195 nf_debug_ip_finish_output2(skb);
 196 #endif/*CONFIG_NETFILTER_DEBUG*/
 197
 198 if(hh) {
 199 read_lock_bh(&hh->hh_lock);
 200 memcpy(skb->data -16, hh->hh_data,16);
 201 read_unlock_bh(&hh->hh_lock);
 202 skb_push(skb, hh->hh_len);
 203 return hh->hh_output(skb);
 204 }else if(dst->neighbour)
 205 return dst->neighbour->output(skb);
 206
 207 printk(KERN_DEBUG "khm\n");
 208 kfree_skb(skb);
 209 return-EINVAL;
 210 }
 211
 212 __inline__ intip_finish_output(struct sk_buff *skb)
 213 {
 214 struct net_device *dev = skb->dst->dev;
 215
 216  skb->dev = dev;
 217  skb->protocol =__constant_htons(ETH_P_IP);
 218
 219 returnNF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 220  ip_finish_output2);
 221 }
 222
 223 intip_mc_output(struct sk_buff *skb)
 224 {
 225 struct sock *sk = skb->sk;
 226 struct rtable *rt = (struct rtable*)skb->dst;
 227 struct net_device *dev = rt->u.dst.dev;
 228
 229 /*
 230  * If the indicated interface is up and running, send the packet.
 231  */
 232  ip_statistics.IpOutRequests++;
 233 #ifdef CONFIG_IP_ROUTE_NAT
 234 if(rt->rt_flags & RTCF_NAT)
 235 ip_do_nat(skb);
 236 #endif
 237
 238  skb->dev = dev;
 239  skb->protocol =__constant_htons(ETH_P_IP);
 240
 241 /*
 242  * Multicasts are looped back for other local users
 243  */
 244
 245 if(rt->rt_flags&RTCF_MULTICAST && (!sk || sk->protinfo.af_inet.mc_loop)) {
 246 #ifdef CONFIG_IP_MROUTE
 247 /* Small optimization: do not loopback not local frames,
 248  which returned after forwarding; they will be dropped
 249  by ip_mr_input in any case.
 250  Note, that local frames are looped back to be delivered
 251  to local recipients.
 252
 253  This check is duplicated in ip_mr_input at the moment.
 254  */
 255 if((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 256 #endif
 257 {
 258 struct sk_buff *newskb =skb_clone(skb, GFP_ATOMIC);
 259 if(newskb)
 260 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL,
 261  newskb->dev,
 262  ip_dev_loopback_xmit);
 263 }
 264
 265 /* Multicasts with ttl 0 must not go beyond the host */
 266
 267 if(skb->nh.iph->ttl ==0) {
 268 kfree_skb(skb);
 269 return0;
 270 }
 271 }
 272
 273 if(rt->rt_flags&RTCF_BROADCAST) {
 274 struct sk_buff *newskb =skb_clone(skb, GFP_ATOMIC);
 275 if(newskb)
 276 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 277  newskb->dev, ip_dev_loopback_xmit);
 278 }
 279
 280 returnip_finish_output(skb);
 281 }
 282
 283 intip_output(struct sk_buff *skb)
 284 {
 285 #ifdef CONFIG_IP_ROUTE_NAT
 286 struct rtable *rt = (struct rtable*)skb->dst;
 287 #endif
 288
 289  ip_statistics.IpOutRequests++;
 290
 291 #ifdef CONFIG_IP_ROUTE_NAT
 292 if(rt->rt_flags&RTCF_NAT)
 293 ip_do_nat(skb);
 294 #endif
 295
 296 returnip_finish_output(skb);
 297 }
 298
 299 /* Queues a packet to be sent, and starts the transmitter if necessary.
 300  * This routine also needs to put in the total length and compute the
 301  * checksum. We use to do this in two stages, ip_build_header() then
 302  * this, but that scheme created a mess when routes disappeared etc.
 303  * So we do it all here, and the TCP send engine has been changed to
 304  * match. (No more unroutable FIN disasters, etc. wheee...) This will
 305  * most likely make other reliable transport layers above IP easier
 306  * to implement under Linux.
 307  */
 308 staticinlineintip_queue_xmit2(struct sk_buff *skb)
 309 {
 310 struct sock *sk = skb->sk;
 311 struct rtable *rt = (struct rtable *)skb->dst;
 312 struct net_device *dev;
 313 struct iphdr *iph = skb->nh.iph;
 314
 315 #ifdef CONFIG_NETFILTER
 316 /* BLUE-PEN-FOR-ALEXEY. I don't understand; you mean I can't
 317  hold the route as I pass the packet to userspace? -- RR
 318
 319  You may hold it, if you really hold it. F.e. if netfilter
 320  does not destroy handed skb with skb->dst attached, it
 321  will be held. When it was stored in info->arg, then
 322  it was not held apparently. Now (without second arg) it is evident,
 323  that it is clean. --ANK
 324  */
 325 if(rt==NULL || (skb->nfcache & NFC_ALTERED)) {
 326 if(route_me_harder(skb) !=0) {
 327 kfree_skb(skb);
 328 return-EHOSTUNREACH;
 329 }
 330 }
 331 #endif
 332
 333  dev = rt->u.dst.dev;
 334
 335 /* This can happen when the transport layer has segments queued
 336  * with a cached route, and by the time we get here things are
 337  * re-routed to a device with a different MTU than the original
 338  * device. Sick, but we must cover it.
 339  */
 340 if(skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
 341 struct sk_buff *skb2;
 342
 343  skb2 =skb_realloc_headroom(skb, (dev->hard_header_len +15) & ~15);
 344 kfree_skb(skb);
 345 if(skb2 == NULL)
 346 return-ENOMEM;
 347 if(sk)
 348 skb_set_owner_w(skb2, sk);
 349  skb = skb2;
 350  iph = skb->nh.iph;
 351 }
 352
 353 if(skb->len > rt->u.dst.pmtu)
 354 goto fragment;
 355
 356 if(ip_dont_fragment(sk, &rt->u.dst))
 357  iph->frag_off |=__constant_htons(IP_DF);
 358
 359 /* Add an IP checksum. */
 360 ip_send_check(iph);
 361
 362  skb->priority = sk->priority;
 363 return skb->dst->output(skb);
 364
 365 fragment:
 366 if(ip_dont_fragment(sk, &rt->u.dst)) {
 367 /* Reject packet ONLY if TCP might fragment
 368  * it itself, if were careful enough.
 369  */
 370  iph->frag_off |=__constant_htons(IP_DF);
 371 NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));
 372
 373 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 374 htonl(rt->u.dst.pmtu));
 375 kfree_skb(skb);
 376 return-EMSGSIZE;
 377 }
 378 returnip_fragment(skb, skb->dst->output);
 379 }
 380
 381 intip_queue_xmit(struct sk_buff *skb)
 382 {
 383 struct sock *sk = skb->sk;
 384 struct ip_options *opt = sk->protinfo.af_inet.opt;
 385 struct rtable *rt;
 386 struct iphdr *iph;
 387
 388 /* Make sure we can route this packet. */
 389  rt = (struct rtable *)__sk_dst_check(sk,0);
 390 if(rt == NULL) {
 391  u32 daddr;
 392
 393 /* Use correct destination address if we have options. */
 394  daddr = sk->daddr;
 395 if(opt && opt->srr)
 396  daddr = opt->faddr;
 397
 398 /* If this fails, retransmit mechanism of transport layer will
 399  * keep trying until route appears or the connection times itself
 400  * out.
 401  */
 402 if(ip_route_output(&rt, daddr, sk->saddr,
 403 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
 404  sk->bound_dev_if))
 405 goto no_route;
 406 __sk_dst_set(sk, &rt->u.dst);
 407 }
 408  skb->dst =dst_clone(&rt->u.dst);
 409
 410 if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 411 goto no_route;
 412
 413 /* OK, we know where to send it, allocate and build IP header. */
 414  iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr) + (opt ? opt->optlen :0));
 415  iph->version =4;
 416  iph->ihl =5;
 417  iph->tos = sk->protinfo.af_inet.tos;
 418  iph->frag_off =0;
 419  iph->ttl = sk->protinfo.af_inet.ttl;
 420  iph->daddr = rt->rt_dst;
 421  iph->saddr = rt->rt_src;
 422  iph->protocol = sk->protocol;
 423  skb->nh.iph = iph;
 424 /* Transport layer set skb->h.foo itself. */
 425
 426 if(opt && opt->optlen) {
 427  iph->ihl += opt->optlen >>2;
 428 ip_options_build(skb, opt, sk->daddr, rt,0);
 429 }
 430
 431  iph->tot_len =htons(skb->len);
 432  iph->id =htons(ip_id_count++);
 433
 434 returnNF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 435  ip_queue_xmit2);
 436
 437 no_route:
 438  ip_statistics.IpOutNoRoutes++;
 439 kfree_skb(skb);
 440 return-EHOSTUNREACH;
 441 }
 442
 443 /*
 444  * Build and send a packet, with as little as one copy
 445  *
 446  * Doesn't care much about ip options... option length can be
 447  * different for fragment at 0 and other fragments.
 448  *
 449  * Note that the fragment at the highest offset is sent first,
 450  * so the getfrag routine can fill in the TCP/UDP checksum header
 451  * field in the last fragment it sends... actually it also helps
 452  * the reassemblers, they can put most packets in at the head of
 453  * the fragment queue, and they know the total size in advance. This
 454  * last feature will measurably improve the Linux fragment handler one
 455  * day.
 456  *
 457  * The callback has five args, an arbitrary pointer (copy of frag),
 458  * the source IP address (may depend on the routing table), the
 459  * destination address (char *), the offset to copy from, and the
 460  * length to be copied.
 461  */
 462
 463 static intip_build_xmit_slow(struct sock *sk,
 464 intgetfrag(const void*,
 465 char*,
 466 unsigned int,
 467 unsigned int),
 468 const void*frag,
 469 unsigned length,
 470 struct ipcm_cookie *ipc,
 471 struct rtable *rt,
 472 int flags)
 473 {
 474 unsigned int fraglen, maxfraglen, fragheaderlen;
 475 int err;
 476 int offset, mf;
 477 int mtu;
 478 unsigned short id;
 479
 480 int hh_len = (rt->u.dst.dev->hard_header_len +15)&~15;
 481 int nfrags=0;
 482 struct ip_options *opt = ipc->opt;
 483 int df =0;
 484
 485  mtu = rt->u.dst.pmtu;
 486 if(ip_dont_fragment(sk, &rt->u.dst))
 487  df =htons(IP_DF);
 488
 489  length -=sizeof(struct iphdr);
 490
 491 if(opt) {
 492  fragheaderlen =sizeof(struct iphdr) + opt->optlen;
 493  maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
 494 }else{
 495  fragheaderlen =sizeof(struct iphdr);
 496
 497 /*
 498  * Fragheaderlen is the size of 'overhead' on each buffer. Now work
 499  * out the size of the frames to send.
 500  */
 501
 502  maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
 503 }
 504
 505 if(length + fragheaderlen >0xFFFF) {
 506 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
 507 return-EMSGSIZE;
 508 }
 509
 510 /*
 511  * Start at the end of the frame by handling the remainder.
 512  */
 513
 514  offset = length - (length % (maxfraglen - fragheaderlen));
 515
 516 /*
 517  * Amount of memory to allocate for final fragment.
 518  */
 519
 520  fraglen = length - offset + fragheaderlen;
 521
 522 if(length-offset==0) {
 523  fraglen = maxfraglen;
 524  offset -= maxfraglen-fragheaderlen;
 525 }
 526
 527 /*
 528  * The last fragment will not have MF (more fragments) set.
 529  */
 530
 531  mf =0;
 532
 533 /*
 534  * Don't fragment packets for path mtu discovery.
 535  */
 536
 537 if(offset >0&& df) {
 538 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
 539 return-EMSGSIZE;
 540 }
 541 if(flags&MSG_PROBE)
 542 goto out;
 543
 544 /*
 545  * Get an identifier
 546  */
 547
 548  id =htons(ip_id_count++);
 549
 550 /*
 551  * Begin outputting the bytes.
 552  */
 553
 554 do{
 555 char*data;
 556 struct sk_buff * skb;
 557
 558 /*
 559  * Get the memory we require with some space left for alignment.
 560  */
 561
 562  skb =sock_alloc_send_skb(sk, fraglen+hh_len+15,0, flags&MSG_DONTWAIT, &err);
 563 if(skb == NULL)
 564 goto error;
 565
 566 /*
 567  * Fill in the control structures
 568  */
 569
 570  skb->priority = sk->priority;
 571  skb->dst =dst_clone(&rt->u.dst);
 572 skb_reserve(skb, hh_len);
 573
 574 /*
 575  * Find where to start putting bytes.
 576  */
 577
 578  data =skb_put(skb, fraglen);
 579  skb->nh.iph = (struct iphdr *)data;
 580
 581 /*
 582  * Only write IP header onto non-raw packets
 583  */
 584
 585 {
 586 struct iphdr *iph = (struct iphdr *)data;
 587
 588  iph->version =4;
 589  iph->ihl =5;
 590 if(opt) {
 591  iph->ihl += opt->optlen>>2;
 592 ip_options_build(skb, opt,
 593  ipc->addr, rt, offset);
 594 }
 595  iph->tos = sk->protinfo.af_inet.tos;
 596  iph->tot_len =htons(fraglen - fragheaderlen + iph->ihl*4);
 597  iph->id = id;
 598  iph->frag_off =htons(offset>>3);
 599  iph->frag_off |= mf|df;
 600 if(rt->rt_type == RTN_MULTICAST)
 601  iph->ttl = sk->protinfo.af_inet.mc_ttl;
 602 else
 603  iph->ttl = sk->protinfo.af_inet.ttl;
 604  iph->protocol = sk->protocol;
 605  iph->check =0;
 606  iph->saddr = rt->rt_src;
 607  iph->daddr = rt->rt_dst;
 608  iph->check =ip_fast_csum((unsigned char*)iph, iph->ihl);
 609  data += iph->ihl*4;
 610
 611 /*
 612  * Any further fragments will have MF set.
 613  */
 614
 615  mf =htons(IP_MF);
 616 }
 617
 618 /*
 619  * User data callback
 620  */
 621
 622 if(getfrag(frag, data, offset, fraglen-fragheaderlen)) {
 623  err = -EFAULT;
 624 kfree_skb(skb);
 625 goto error;
 626 }
 627
 628  offset -= (maxfraglen-fragheaderlen);
 629  fraglen = maxfraglen;
 630
 631  nfrags++;
 632
 633  err =NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
 634  skb->dst->dev, output_maybe_reroute);
 635 if(err) {
 636 if(err >0)
 637  err = sk->protinfo.af_inet.recverr ?net_xmit_errno(err) :0;
 638 if(err)
 639 goto error;
 640 }
 641 }while(offset >=0);
 642
 643 if(nfrags>1)
 644  ip_statistics.IpFragCreates += nfrags;
 645 out:
 646 return0;
 647
 648 error:
 649  ip_statistics.IpOutDiscards++;
 650 if(nfrags>1)
 651  ip_statistics.IpFragCreates += nfrags;
 652 return err;
 653 }
 654
 655 /*
 656  * Fast path for unfragmented packets.
 657  */
 658 intip_build_xmit(struct sock *sk,
 659 intgetfrag(const void*,
 660 char*,
 661 unsigned int,
 662 unsigned int),
 663 const void*frag,
 664 unsigned length,
 665 struct ipcm_cookie *ipc,
 666 struct rtable *rt,
 667 int flags)
 668 {
 669 int err;
 670 struct sk_buff *skb;
 671 int df;
 672 struct iphdr *iph;
 673
 674 /*
 675  * Try the simple case first. This leaves fragmented frames, and by
 676  * choice RAW frames within 20 bytes of maximum size(rare) to the long path
 677  */
 678
 679 if(!sk->protinfo.af_inet.hdrincl) {
 680  length +=sizeof(struct iphdr);
 681
 682 /*
 683  * Check for slow path.
 684  */
 685 if(length > rt->u.dst.pmtu || ipc->opt != NULL)
 686 returnip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
 687 }else{
 688 if(length > rt->u.dst.dev->mtu) {
 689 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
 690 return-EMSGSIZE;
 691 }
 692 }
 693 if(flags&MSG_PROBE)
 694 goto out;
 695
 696 /*
 697  * Do path mtu discovery if needed.
 698  */
 699  df =0;
 700 if(ip_dont_fragment(sk, &rt->u.dst))
 701  df =htons(IP_DF);
 702
 703 /*
 704  * Fast path for unfragmented frames without options.
 705  */
 706 {
 707 int hh_len = (rt->u.dst.dev->hard_header_len +15)&~15;
 708
 709  skb =sock_alloc_send_skb(sk, length+hh_len+15,
 710 0, flags&MSG_DONTWAIT, &err);
 711 if(skb==NULL)
 712 goto error;
 713 skb_reserve(skb, hh_len);
 714 }
 715
 716  skb->priority = sk->priority;
 717  skb->dst =dst_clone(&rt->u.dst);
 718
 719  skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
 720
 721 if(!sk->protinfo.af_inet.hdrincl) {
 722  iph->version=4;
 723  iph->ihl=5;
 724  iph->tos=sk->protinfo.af_inet.tos;
 725  iph->tot_len =htons(length);
 726  iph->id=htons(ip_id_count++);
 727  iph->frag_off = df;
 728  iph->ttl=sk->protinfo.af_inet.mc_ttl;
 729 if(rt->rt_type != RTN_MULTICAST)
 730  iph->ttl=sk->protinfo.af_inet.ttl;
 731  iph->protocol=sk->protocol;
 732  iph->saddr=rt->rt_src;
 733  iph->daddr=rt->rt_dst;
 734  iph->check=0;
 735  iph->check =ip_fast_csum((unsigned char*)iph, iph->ihl);
 736  err =getfrag(frag, ((char*)iph)+iph->ihl*4,0, length-iph->ihl*4);
 737 }
 738 else
 739  err =getfrag(frag, (void*)iph,0, length);
 740
 741 if(err)
 742 goto error_fault;
 743
 744  err =NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 745  output_maybe_reroute);
 746 if(err >0)
 747  err = sk->protinfo.af_inet.recverr ?net_xmit_errno(err) :0;
 748 if(err)
 749 goto error;
 750 out:
 751 return0;
 752
 753 error_fault:
 754  err = -EFAULT;
 755 kfree_skb(skb);
 756 error:
 757  ip_statistics.IpOutDiscards++;
 758 return err;
 759 }
 760
 761
 762
 763 /*
 764  * This IP datagram is too large to be sent in one piece. Break it up into
 765  * smaller pieces (each of size equal to IP header plus
 766  * a block of the data of the original IP data part) that will yet fit in a
 767  * single device frame, and queue such a frame for sending.
 768  *
 769  * Yes this is inefficient, feel free to submit a quicker one.
 770  */
 771
 772 intip_fragment(struct sk_buff *skb,int(*output)(struct sk_buff*))
 773 {
 774 struct iphdr *iph;
 775 unsigned char*raw;
 776 unsigned char*ptr;
 777 struct net_device *dev;
 778 struct sk_buff *skb2;
 779 unsigned int mtu, hlen, left, len;
 780 int offset;
 781 int not_last_frag;
 782 struct rtable *rt = (struct rtable*)skb->dst;
 783 int err =0;
 784
 785  dev = rt->u.dst.dev;
 786
 787 /*
 788  * Point into the IP datagram header.
 789  */
 790
 791  raw = skb->nh.raw;
 792  iph = (struct iphdr*)raw;
 793
 794 /*
 795  * Setup starting values.
 796  */
 797
 798  hlen = iph->ihl *4;
 799  left =ntohs(iph->tot_len) - hlen;/* Space per frame */
 800  mtu = rt->u.dst.pmtu - hlen;/* Size of data space */
 801  ptr = raw + hlen;/* Where to start from */
 802
 803 /*
 804  * Fragment the datagram.
 805  */
 806
 807  offset = (ntohs(iph->frag_off) & IP_OFFSET) <<3;
 808  not_last_frag = iph->frag_off &htons(IP_MF);
 809
 810 /*
 811  * Keep copying data until we run out.
 812  */
 813
 814 while(left >0) {
 815  len = left;
 816 /* IF: it doesn't fit, use 'mtu' - the data space left */
 817 if(len > mtu)
 818  len = mtu;
 819 /* IF: we are not sending upto and including the packet end
 820  then align the next start on an eight byte boundary */
 821 if(len < left) {
 822  len &= ~7;
 823 }
 824 /*
 825  * Allocate buffer.
 826  */
 827
 828 if((skb2 =alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
 829 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 830  err = -ENOMEM;
 831 goto fail;
 832 }
 833
 834 /*
 835  * Set up data on packet
 836  */
 837
 838  skb2->pkt_type = skb->pkt_type;
 839  skb2->priority = skb->priority;
 840 skb_reserve(skb2, (dev->hard_header_len+15)&~15);
 841 skb_put(skb2, len + hlen);
 842  skb2->nh.raw = skb2->data;
 843  skb2->h.raw = skb2->data + hlen;
 844
 845 /*
 846  * Charge the memory for the fragment to any owner
 847  * it might possess
 848  */
 849
 850 if(skb->sk)
 851 skb_set_owner_w(skb2, skb->sk);
 852  skb2->dst =dst_clone(skb->dst);
 853
 854 /*
 855  * Copy the packet header into the new buffer.
 856  */
 857
 858 memcpy(skb2->nh.raw, raw, hlen);
 859
 860 /*
 861  * Copy a block of the IP datagram.
 862  */
 863 memcpy(skb2->h.raw, ptr, len);
 864  left -= len;
 865
 866 /*
 867  * Fill in the new header fields.
 868  */
 869  iph = skb2->nh.iph;
 870  iph->frag_off =htons((offset >>3));
 871
 872 /* ANK: dirty, but effective trick. Upgrade options only if
 873  * the segment to be fragmented was THE FIRST (otherwise,
 874  * options are already fixed) and make it ONCE
 875  * on the initial skb, so that all the following fragments
 876  * will inherit fixed options.
 877  */
 878 if(offset ==0)
 879 ip_options_fragment(skb);
 880
 881 /*
 882  * Added AC : If we are fragmenting a fragment that's not the
 883  * last fragment then keep MF on each bit
 884  */
 885 if(left >0|| not_last_frag)
 886  iph->frag_off |=htons(IP_MF);
 887  ptr += len;
 888  offset += len;
 889
 890 /*
 891  * Put this fragment into the sending queue.
 892  */
 893
 894  ip_statistics.IpFragCreates++;
 895
 896  iph->tot_len =htons(len + hlen);
 897
 898 ip_send_check(iph);
 899
 900  err =output(skb2);
 901 if(err)
 902 goto fail;
 903 }
 904 kfree_skb(skb);
 905  ip_statistics.IpFragOKs++;
 906 return err;
 907
 908 fail:
 909 kfree_skb(skb);
 910  ip_statistics.IpFragFails++;
 911 return err;
 912 }
 913
 914 /*
 915  * Fetch data from kernel space and fill in checksum if needed.
 916  */
 917 static intip_reply_glue_bits(const void*dptr,char*to,unsigned int offset,
 918 unsigned int fraglen)
 919 {
 920 struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
 921  u16 *pktp = (u16 *)to;
 922 struct iovec *iov;
 923 int len;
 924 int hdrflag =1;
 925
 926  iov = &dp->iov[0];
 927 if(offset >= iov->iov_len) {
 928  offset -= iov->iov_len;
 929  iov++;
 930  hdrflag =0;
 931 }
 932  len = iov->iov_len - offset;
 933 if(fraglen > len) {/* overlapping. */
 934  dp->csum =csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
 935  dp->csum);
 936  offset =0;
 937  fraglen -= len;
 938  to += len;
 939  iov++;
 940 }
 941
 942  dp->csum =csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
 943  dp->csum);
 944
 945 if(hdrflag && dp->csumoffset)
 946 *(pktp + dp->csumoffset) =csum_fold(dp->csum);/* fill in checksum */
 947 return0;
 948 }
 949
 950 /*
 951  * Generic function to send a packet as reply to another packet.
 952  * Used to send TCP resets so far. ICMP should use this function too.
 953  *
 954  * Should run single threaded per socket because it uses the sock
 955  * structure to pass arguments.
 956  */
 957 voidip_send_reply(struct sock *sk,struct sk_buff *skb,struct ip_reply_arg *arg,
 958 unsigned int len)
 959 {
 960 struct{
 961 struct ip_options opt;
 962 char data[40];
 963 } replyopts;
 964 struct ipcm_cookie ipc;
 965  u32 daddr;
 966 struct rtable *rt = (struct rtable*)skb->dst;
 967
 968 if(ip_options_echo(&replyopts.opt, skb))
 969 return;
 970
 971  daddr = ipc.addr = rt->rt_src;
 972  ipc.opt = &replyopts.opt;
 973
 974 if(ipc.opt->srr)
 975  daddr = replyopts.opt.faddr;
 976 if(ip_route_output(&rt, daddr, rt->rt_spec_dst,RT_TOS(skb->nh.iph->tos),0))
 977 return;
 978
 979 /* And let IP do all the hard work.
 980
 981  This chunk is not reenterable, hence spinlock.
 982  Note that it uses the fact, that this function is called
 983  with locally disabled BH and that sk cannot be already spinlocked.
 984  */
 985 bh_lock_sock(sk);
 986  sk->protinfo.af_inet.tos = skb->nh.iph->tos;
 987  sk->priority = skb->priority;
 988  sk->protocol = skb->nh.iph->protocol;
 989 ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
 990 bh_unlock_sock(sk);
 991
 992 ip_rt_put(rt);
 993 }
 994
 995 /*
 996  * IP protocol layer initialiser
 997  */
 998
 999 static struct packet_type ip_packet_type =
1000 {
1001 __constant_htons(ETH_P_IP),
1002  NULL,/* All devices */
1003  ip_rcv,
1004 (void*)1,
1005  NULL,
1006 };
1007
1008
1009
1010 #ifdef CONFIG_PROC_FS
1011 #ifdef CONFIG_IP_MULTICAST
1012 static struct proc_dir_entry proc_net_igmp = {
1013  PROC_NET_IGMP,4,"igmp",
1014  S_IFREG | S_IRUGO,1,0,0,
1015 0, &proc_net_inode_operations,
1016  ip_mc_procinfo
1017 };
1018 #endif
1019 #endif
1020
1021 /*
1022  * IP registers the packet type and then calls the subprotocol initialisers
1023  */
1024
1025 void __init ip_init(void)
1026 {
1027 dev_add_pack(&ip_packet_type);
1028
1029 ip_rt_init();
1030
1031 #ifdef CONFIG_PROC_FS
1032 #ifdef CONFIG_IP_MULTICAST
1033 proc_net_register(&proc_net_igmp);
1034 #endif
1035 #endif
1036 }
1037