root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_pmd
  11. swap_out_pgd
  12. swap_out_vma
  13. swap_out_process
  14. swap_out
  15. try_to_free_page
  16. add_mem_queue
  17. remove_mem_queue
  18. free_pages_ok
  19. check_free_buffers
  20. free_pages
  21. mark_used
  22. __get_free_pages
  23. __get_dma_pages
  24. show_free_areas
  25. unuse_pte
  26. unuse_pmd
  27. unuse_pgd
  28. unuse_vma
  29. unuse_process
  30. try_to_unuse
  31. sys_swapoff
  32. sys_swapon
  33. si_swapinfo
  34. free_area_init

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file should contain most things doing the swapping from/to disk.
   9  * Started 18.12.91
  10  */
  11 
  12 #include <linux/mm.h>
  13 #include <linux/sched.h>
  14 #include <linux/head.h>
  15 #include <linux/kernel.h>
  16 #include <linux/kernel_stat.h>
  17 #include <linux/errno.h>
  18 #include <linux/string.h>
  19 #include <linux/stat.h>
  20 #include <linux/fs.h>
  21 
  22 #include <asm/dma.h>
  23 #include <asm/system.h> /* for cli()/sti() */
  24 #include <asm/bitops.h>
  25 #include <asm/pgtable.h>
  26 
#define MAX_SWAPFILES 8

/* Values for swap_info_struct.flags. */
#define SWP_USED        1       /* entry is allocated */
#define SWP_WRITEOK     3       /* allocated and writable (includes SWP_USED) */

/* Pages kept in reserve for atomic allocations (see __get_free_pages). */
int min_free_pages = 20;

static int nr_swapfiles = 0;
/* Processes waiting for a page lock bit in some swap_lockmap. */
static struct wait_queue * lock_queue = NULL;

/* Bookkeeping for one swap area (a swap file or a swap device). */
static struct swap_info_struct {
        unsigned long flags;            /* SWP_USED / SWP_WRITEOK */
        struct inode * swap_file;       /* backing inode, if swapping to a file */
        unsigned int swap_device;       /* device number, if swapping to a device */
        unsigned char * swap_map;       /* per-slot use counts */
        unsigned char * swap_lockmap;   /* per-slot I/O lock bits */
        int pages;                      /* number of usable pages */
        int lowest_bit;                 /* free-slot scan window: lower bound */
        int highest_bit;                /* free-slot scan window: upper bound */
        unsigned long max;              /* one past the highest valid offset */
} swap_info[MAX_SWAPFILES];

extern int shm_swap (int);

/*
 * One entry per physical page frame: the swap entry this page is a
 * clean cached copy of, or 0 if it is not in the swap cache.
 */
unsigned long *swap_cache;
  52 
#ifdef SWAP_CACHE_INFO
/*
 * Swap cache statistics: each "_total" counts attempts, each
 * "_success" the attempts that actually touched the cache.
 */
unsigned long swap_cache_add_total = 0;
unsigned long swap_cache_add_success = 0;
unsigned long swap_cache_del_total = 0;
unsigned long swap_cache_del_success = 0;
unsigned long swap_cache_find_total = 0;
unsigned long swap_cache_find_success = 0;

/* Dump the swap-cache statistics to the kernel log. */
extern inline void show_swap_cache_info(void)
{
        printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
                swap_cache_add_total, swap_cache_add_success, 
                swap_cache_del_total, swap_cache_del_success,
                swap_cache_find_total, swap_cache_find_success);
}
#endif
  69 
/*
 * Record that the page at 'addr' is a clean in-memory copy of swap
 * entry 'entry'.  Only done for fully writable swap areas.
 * Returns 1 if the entry was cached, 0 otherwise.
 */
static int add_to_swap_cache(unsigned long addr, unsigned long entry)
{
        struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];

#ifdef SWAP_CACHE_INFO
        swap_cache_add_total++;
#endif
        if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
                /* atomically install the entry; the old slot value should be 0 */
                entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
                if (entry)  {
                        printk("swap_cache: replacing non-NULL entry\n");
                }
#ifdef SWAP_CACHE_INFO
                swap_cache_add_success++;
#endif
                return 1;
        }
        return 0;
}
  89 
  90 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  91         unsigned long mem_end)
  92 {
  93         unsigned long swap_cache_size;
  94 
  95         mem_start = (mem_start + 15) & ~15;
  96         swap_cache = (unsigned long *) mem_start;
  97         swap_cache_size = MAP_NR(mem_end);
  98         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
  99         return (unsigned long) (swap_cache + swap_cache_size);
 100 }
 101 
 102 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 103 {
 104         unsigned long type, offset;
 105         struct swap_info_struct * p;
 106 
 107         type = SWP_TYPE(entry);
 108         if (type >= nr_swapfiles) {
 109                 printk("Internal error: bad swap-device\n");
 110                 return;
 111         }
 112         p = &swap_info[type];
 113         offset = SWP_OFFSET(entry);
 114         if (offset >= p->max) {
 115                 printk("rw_swap_page: weirdness\n");
 116                 return;
 117         }
 118         if (p->swap_map && !p->swap_map[offset]) {
 119                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 120                 return;
 121         }
 122         if (!(p->flags & SWP_USED)) {
 123                 printk("Trying to swap to unused swap-device\n");
 124                 return;
 125         }
 126         while (set_bit(offset,p->swap_lockmap))
 127                 sleep_on(&lock_queue);
 128         if (rw == READ)
 129                 kstat.pswpin++;
 130         else
 131                 kstat.pswpout++;
 132         if (p->swap_device) {
 133                 ll_rw_page(rw,p->swap_device,offset,buf);
 134         } else if (p->swap_file) {
 135                 struct inode *swapf = p->swap_file;
 136                 unsigned int zones[PAGE_SIZE/512];
 137                 int i;
 138                 if (swapf->i_op->bmap == NULL
 139                         && swapf->i_op->smap != NULL){
 140                         /*
 141                                 With MsDOS, we use msdos_smap which return
 142                                 a sector number (not a cluster or block number).
 143                                 It is a patch to enable the UMSDOS project.
 144                                 Other people are working on better solution.
 145 
 146                                 It sounds like ll_rw_swap_file defined
 147                                 it operation size (sector size) based on
 148                                 PAGE_SIZE and the number of block to read.
 149                                 So using bmap or smap should work even if
 150                                 smap will require more blocks.
 151                         */
 152                         int j;
 153                         unsigned int block = offset << 3;
 154 
 155                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 156                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 157                                         printk("rw_swap_page: bad swap file\n");
 158                                         return;
 159                                 }
 160                         }
 161                 }else{
 162                         int j;
 163                         unsigned int block = offset
 164                                 << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
 165 
 166                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 167                                 if (!(zones[i] = bmap(swapf,block++))) {
 168                                         printk("rw_swap_page: bad swap file\n");
 169                                         return;
 170                                 }
 171                 }
 172                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 173         } else
 174                 printk("re_swap_page: no swap file or device\n");
 175         if (offset && !clear_bit(offset,p->swap_lockmap))
 176                 printk("rw_swap_page: lock already cleared\n");
 177         wake_up(&lock_queue);
 178 }
 179 
/*
 * Allocate one free swap slot.  Returns a SWP_ENTRY(type, offset)
 * cookie with a use count of 1, or 0 if all swap areas are full.
 */
unsigned long get_swap_page(void)
{
        struct swap_info_struct * p;
        unsigned long offset, type;

        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++) {
                /* only allocate from fully writable areas */
                if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
                        if (p->swap_map[offset])
                                continue;
                        /* skip slots with I/O in flight */
                        if (test_bit(offset, p->swap_lockmap))
                                continue;
                        p->swap_map[offset] = 1;
                        nr_swap_pages--;
                        if (offset == p->highest_bit)
                                p->highest_bit--;
                        /* everything below 'offset' is in use: narrow the window */
                        p->lowest_bit = offset;
                        return SWP_ENTRY(type,offset);
                }
        }
        return 0;
}
 204 
 205 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 206 {
 207         struct swap_info_struct * p;
 208         unsigned long offset, type;
 209 
 210         if (!entry)
 211                 return;
 212         offset = SWP_OFFSET(entry);
 213         type = SWP_TYPE(entry);
 214         if (type == SHM_SWP_TYPE)
 215                 return;
 216         if (type >= nr_swapfiles) {
 217                 printk("Trying to duplicate nonexistent swap-page\n");
 218                 return;
 219         }
 220         p = type + swap_info;
 221         if (offset >= p->max) {
 222                 printk("swap_duplicate: weirdness\n");
 223                 return;
 224         }
 225         if (!p->swap_map[offset]) {
 226                 printk("swap_duplicate: trying to duplicate unused page\n");
 227                 return;
 228         }
 229         p->swap_map[offset]++;
 230         return;
 231 }
 232 
 233 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 234 {
 235         struct swap_info_struct * p;
 236         unsigned long offset, type;
 237 
 238         if (!entry)
 239                 return;
 240         type = SWP_TYPE(entry);
 241         if (type == SHM_SWP_TYPE)
 242                 return;
 243         if (type >= nr_swapfiles) {
 244                 printk("Trying to free nonexistent swap-page\n");
 245                 return;
 246         }
 247         p = & swap_info[type];
 248         offset = SWP_OFFSET(entry);
 249         if (offset >= p->max) {
 250                 printk("swap_free: weirdness\n");
 251                 return;
 252         }
 253         if (!(p->flags & SWP_USED)) {
 254                 printk("Trying to free swap from unused swap-device\n");
 255                 return;
 256         }
 257         if (offset < p->lowest_bit)
 258                 p->lowest_bit = offset;
 259         if (offset > p->highest_bit)
 260                 p->highest_bit = offset;
 261         if (!p->swap_map[offset])
 262                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 263         else
 264                 if (!--p->swap_map[offset])
 265                         nr_swap_pages++;
 266 }
 267 
 268 /*
 269  * The tests may look silly, but it essentially makes sure that
 270  * no other process did a swap-in on us just as we were waiting.
 271  *
 272  * Also, don't bother to add to the swap cache if this page-in
 273  * was due to a write access.
 274  */
/*
 * Fault a swapped-out page back in.  'entry' is the swap entry found
 * in *page_table; on success the pte is replaced by a present mapping.
 * The pte is re-checked after every operation that may sleep, in case
 * another process completed the swap-in first (see comment above).
 */
void swap_in(struct vm_area_struct * vma, pte_t * page_table,
        unsigned long entry, int write_access)
{
        /* get_free_page() may sleep, hence the pte re-check below */
        unsigned long page = get_free_page(GFP_KERNEL);

        if (pte_val(*page_table) != entry) {
                /* someone else swapped it in while we slept */
                free_page(page);
                return;
        }
        if (!page) {
                *page_table = BAD_PAGE;
                swap_free(entry);
                oom(current);
                return;
        }
        read_swap_page(entry, (char *) page);
        if (pte_val(*page_table) != entry) {
                /* raced again during the swap read */
                free_page(page);
                return;
        }
        vma->vm_task->mm->rss++;
        vma->vm_task->mm->maj_flt++;
        if (!write_access && add_to_swap_cache(page, entry)) {
                /* read fault: map read-only; the swap entry stays allocated
                   and the page is remembered as its clean copy */
                *page_table = mk_pte(page, vma->vm_page_prot);
                return;
        }
        /* write fault (or caching refused): page is private and dirty,
           so release our reference on the swap entry */
        *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        swap_free(entry);
        return;
}
 305 
 306 /*
 307  * The swap-out functions return 1 if they successfully
 308  * threw something out, and we got a free page. It returns
 309  * zero if it couldn't do anything, and any other value
 310  * indicates it decreased rss, but the page was shared.
 311  *
 312  * NOTE! If it sleeps, it *must* return 1 to make sure we
 313  * don't continue with the swap-out. Otherwise we may be
 314  * using a process that no longer actually exists (it might
 315  * have died while we slept).
 316  */
/*
 * Try to evict the page mapped by *page_table (see the return-value
 * contract in the comment above: 1 = freed a page and may have slept,
 * 0 = nothing done, other = rss decreased but page was shared).
 */
static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page;

        pte = *page_table;
        if (!pte_present(pte))
                return 0;
        page = pte_page(pte);
        if (page >= high_memory)
                return 0;
        if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
                return 0;
        /* recently used page (or dirtied while in the swap cache):
           just age it and leave it in memory this round */
        if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
                *page_table = pte_mkold(pte);
                return 0;
        }       
        if (pte_dirty(pte)) {
                /* dirty pages can only be evicted if unshared */
                if (mem_map[MAP_NR(page)] != 1)
                        return 0;
                if (vma->vm_ops && vma->vm_ops->swapout) {
                        /* mapping provides its own writeback */
                        vma->vm_task->mm->rss--;
                        vma->vm_ops->swapout(vma, address-vma->vm_start, page_table);
                } else {
                        if (!(entry = get_swap_page()))
                                return 0;
                        vma->vm_task->mm->rss--;
                        /* store the swap entry in the pte, then write out */
                        pte_val(*page_table) = entry;
                        invalidate();
                        write_swap_page(entry, (char *) page);
                }
                free_page(page);
                return 1;       /* we slept: the process may not exist any more */
        }
        /* clean page that is still a cached copy of a swap entry:
           re-point the pte at the swap entry, no I/O needed */
        if ((entry = find_in_swap_cache(page)))  {
                if (mem_map[MAP_NR(page)] != 1) {
                        *page_table = pte_mkdirty(pte);
                        printk("Aiee.. duplicated cached swap-cache entry\n");
                        return 0;
                }
                vma->vm_task->mm->rss--;
                pte_val(*page_table) = entry;
                invalidate();
                free_page(page);
                return 1;
        } 
        /* clean, not cached: just drop the mapping; return the old
           mem_map use count so the caller can tell shared from freed */
        vma->vm_task->mm->rss--;
        pte_clear(page_table);
        invalidate();
        entry = mem_map[MAP_NR(page)];
        free_page(page);
        return entry;
}
 371 
 372 /*
 373  * A new implementation of swap_out().  We do not swap complete processes,
 374  * but only a small number of blocks, before we continue with the next
 375  * process.  The number of blocks actually swapped is determined on the
 376  * number of page faults, that this process actually had in the last time,
 377  * so we won't swap heavily used processes all the time ...
 378  *
 * Note: the priority argument is a hint on how much CPU to waste with the
 *       swap block search, not a hint of how many blocks to swap with
 381  *       each process.
 382  *
 383  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 384  */
 385 
 386 /*
 387  * These are the minimum and maximum number of pages to swap from one process,
 388  * before proceeding to the next:
 389  */
 390 #define SWAP_MIN        4
 391 #define SWAP_MAX        32
 392 
 393 /*
 394  * The actual number of pages to swap is determined as:
 395  * SWAP_RATIO / (number of recent major page faults)
 396  */
 397 #define SWAP_RATIO      128
 398 
/*
 * Walk the page table under one pmd entry, trying to swap out each
 * page in [address, end).  Returns the first nonzero result from
 * try_to_swap_out(), or 0 if nothing was evicted.
 */
static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
        unsigned long address, unsigned long end)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }
        
        pte = pte_offset(dir, address);
        
        /* clamp the range to this pmd's span */
        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                /* record the resume point first: try_to_swap_out() may sleep */
                vma->vm_task->mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(vma, address, pte);
                if (result)
                        return result;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}
 430 
/*
 * Walk the pmds under one pgd entry, trying to swap out pages in
 * [address, end).  Returns the first nonzero swap_out_pmd() result,
 * or 0 if nothing was evicted.
 */
static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
        unsigned long address, unsigned long end)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        /* clamp the range to this pgd entry's span */
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
        if (end > pgd_end)
                end = pgd_end;
        
        do {
                int result = swap_out_pmd(vma, pmd, address, end);
                if (result)
                        return result;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
 460 
 461 static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 462         unsigned long start)
 463 {
 464         unsigned long end;
 465 
 466         /* Don't swap out areas like shared memory which have their
 467             own separate swapping mechanism. */
 468         if (vma->vm_flags & VM_SHM)
 469                 return 0;
 470 
 471         end = vma->vm_end;
 472         while (start < end) {
 473                 int result = swap_out_pgd(vma, pgdir, start, end);
 474                 if (result)
 475                         return result;
 476                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 477                 pgdir++;
 478         }
 479         return 0;
 480 }
 481 
/*
 * Scan one process' address space, resuming where the previous scan of
 * this process stopped (mm->swap_address), and try to evict pages
 * until one of the vma walkers reports progress.  Returns that
 * nonzero result, or 0 after the whole address space was scanned.
 */
static int swap_out_process(struct task_struct * p)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->mm->swap_address;
        p->mm->swap_address = 0;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p, address);
        if (!vma)
                return 0;
        if (address < vma->vm_start)
                address = vma->vm_start;

        for (;;) {
                int result = swap_out_vma(vma, pgd_offset(p, address), address);
                if (result)
                        return result;
                vma = vma->vm_next;
                if (!vma)
                        break;
                address = vma->vm_start;
        }
        /* reached the end: start from the beginning next time */
        p->mm->swap_address = 0;
        return 0;
}
 514 
/*
 * Round-robin over all tasks (static 'swap_task' persists across
 * calls), trying to swap pages out of each.  'priority' scales the
 * total effort; per-process effort (mm->swap_cnt) is derived from the
 * recent major-fault rate so busy processes are swapped less (see the
 * SWAP_RATIO comment above).  Returns 1 if a page was freed.
 */
static int swap_out(unsigned int priority)
{
        static int swap_task;
        int loop, counter;
        struct task_struct *p;

        counter = 6*nr_tasks >> priority;
        for(; counter >= 0; counter--) {
                /*
                 * Check that swap_task is suitable for swapping.  If not, look for
                 * the next suitable process.
                 */
                loop = 0;
                while(1) {
                        if (swap_task >= NR_TASKS) {
                                /* wrap around; slot 0 (idle/init) is skipped */
                                swap_task = 1;
                                if (loop)
                                        /* all processes are unswappable or already swapped out */
                                        return 0;
                                loop = 1;
                        }

                        p = task[swap_task];
                        if (p && p->mm->swappable && p->mm->rss)
                                break;

                        swap_task++;
                }

                /*
                 * Determine the number of pages to swap from this process.
                 */
                if (!p->mm->swap_cnt) {
                        /* decaying average of recent major faults */
                        p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
                        p->mm->old_maj_flt = p->mm->maj_flt;

                        if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
                                p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
                                p->mm->swap_cnt = SWAP_MIN;
                        } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
                                p->mm->swap_cnt = SWAP_MAX;
                        else
                                p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
                }
                /* quota used up: move on to the next task afterwards */
                if (!--p->mm->swap_cnt)
                        swap_task++;
                switch (swap_out_process(p)) {
                        case 0:
                                /* no progress: advance unless we already did above */
                                if (p->mm->swap_cnt)
                                        swap_task++;
                                break;
                        case 1:
                                return 1;
                        default:
                                break;
                }
        }
        return 0;
}
 574 
 575 /*
 576  * we keep on shrinking one resource until it's considered "too hard",
 577  * and then switch to the next one (priority being an indication on how
 578  * hard we should try with the resource).
 579  *
 580  * This should automatically find the resource that can most easily be
 581  * free'd, so hopefully we'll get reasonable behaviour even under very
 582  * different circumstances.
 583  */
/*
 * Try to free one page by shrinking, in turn, the buffer cache,
 * shared memory, and process pages (see the strategy comment above).
 * The static 'state' remembers which resource to try first on the
 * next call; the switch jumps into the middle of the do-while, which
 * then cycles through the resources with decreasing effort (i: 6..0).
 * Returns 1 if a page was freed, 0 if everything failed.
 */
static int try_to_free_page(int priority)
{
        static int state = 0;
        int i=6;

        switch (state) {
                do {
                case 0:
                        if (priority != GFP_NOBUFFER && shrink_buffers(i))
                                return 1;
                        state = 1;
                case 1:
                        if (shm_swap(i))
                                return 1;
                        state = 2;
                default:
                        if (swap_out(i))
                                return 1;
                        state = 0;
                } while(i--);
        }
        return 0;
}
 607 
 608 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 609 {
 610         entry->prev = head;
 611         (entry->next = head->next)->prev = entry;
 612         head->next = entry;
 613 }
 614 
 615 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 616 {
 617         entry->next->prev = entry->prev;
 618         entry->prev->next = entry->next;
 619 }
 620 
 621 /*
 622  * Free_page() adds the page to the free lists. This is optimized for
 623  * fast normal cases (no error jumps taken normally).
 624  *
 625  * The way to optimize jumps for gcc-2.2.2 is to:
 626  *  - select the "normal" case and put it inside the if () { XXX }
 627  *  - no else-statements if you can avoid them
 628  *
 629  * With the above two rules, you get a straight-line execution path
 630  * for the normal case, giving better asm-code.
 631  *
 632  * free_page() may sleep since the page being freed may be a buffer
 633  * page or present in the swap cache. It will not sleep, however,
 634  * for a freshly allocated page (get_free_page()).
 635  */
 636 
 637 /*
 638  * Buddy system. Hairy. You really aren't expected to understand this
 639  */
/*
 * Return a block of 2^order pages at 'addr' to the buddy free lists,
 * coalescing with its buddy at each level while possible.  Each
 * free_area_map bit covers a buddy pair; change_bit() returns the old
 * value, so a result of 0 means the buddy is still in use and merging
 * stops.  Called with interrupts off (see free_pages).
 */
static inline void free_pages_ok(unsigned long addr, unsigned long order)
{
        unsigned long index = MAP_NR(addr) >> (1 + order);
        unsigned long mask = PAGE_MASK << order;

        addr &= mask;
        nr_free_pages += 1 << order;
        while (order < NR_MEM_LISTS-1) {
                if (!change_bit(index, free_area_map[order]))
                        break;
                /* buddy is free too: unlink it and merge one level up.
                   (addr ^ (1+~mask)) == addr ^ -mask flips the buddy bit. */
                remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
                order++;
                index >>= 1;
                mask <<= 1;
                addr &= mask;
        }
        add_mem_queue(free_area_list+order, (struct mem_list *) addr);
}
 658 
 659 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 660 {
 661         struct buffer_head * bh;
 662 
 663         bh = buffer_pages[MAP_NR(addr)];
 664         if (bh) {
 665                 struct buffer_head *tmp = bh;
 666                 do {
 667                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 668                                 refile_buffer(tmp);
 669                         tmp = tmp->b_this_page;
 670                 } while (tmp != bh);
 671         }
 672 }
 673 
/*
 * Drop one reference on the block of 2^order pages at 'addr'.  When
 * the count reaches zero the block goes back to the buddy lists and
 * its swap-cache entry is dropped.  Reserved pages and out-of-range
 * addresses are silently ignored; freeing an already-free page only
 * logs a warning.
 */
void free_pages(unsigned long addr, unsigned long order)
{
        if (addr < high_memory) {
                unsigned long flag;
                mem_map_t * map = mem_map + MAP_NR(addr);
                if (*map) {
                        if (!(*map & MAP_PAGE_RESERVED)) {
                                /* count update and list insertion must be atomic */
                                save_flags(flag);
                                cli();
                                if (!--*map)  {
                                        free_pages_ok(addr, order);
                                        delete_from_swap_cache(addr);
                                }
                                restore_flags(flag);
                                /* one user left: its buffers may be refiled now */
                                if (*map == 1)
                                        check_free_buffers(addr);
                        }
                        return;
                }
                printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
                printk("PC = %p\n", __builtin_return_address(0));
                return;
        }
}
 698 
 699 /*
 700  * Some ugly macros to speed up __get_free_pages()..
 701  */
/*
 * RMQUEUE(order): scan the free lists from 'order' upwards; on the
 * first non-empty list, unlink the head block, mark it used, split it
 * back down to the requested order with EXPAND(), and return its
 * address from the enclosing function.  Re-enables interrupts
 * (restore_flags) once the lists are consistent.  Falls through if
 * every list is empty, in which case the caller returns 0.
 */
#define RMQUEUE(order) \
do { struct mem_list * queue = free_area_list+order; \
     unsigned long new_order = order; \
        do { struct mem_list *next = queue->next; \
                if (queue != next) { \
                        (queue->next = next->next)->prev = queue; \
                        mark_used((unsigned long) next, new_order); \
                        nr_free_pages -= 1 << order; \
                        restore_flags(flags); \
                        EXPAND(next, order, new_order); \
                        return (unsigned long) next; \
                } new_order++; queue++; \
        } while (new_order < NR_MEM_LISTS); \
} while (0)
 716 
/*
 * Toggle the buddy-pair bit for the block at 'addr' of the given
 * order; returns the bit's previous value (see free_pages_ok for the
 * pairing convention).
 */
static inline int mark_used(unsigned long addr, unsigned long order)
{
        return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
}
 721 
/*
 * EXPAND(addr, low, high): split a block of order 'high' down to
 * order 'low'.  At each step the lower half is put back on the free
 * list (under cli; restore_flags afterwards) and 'addr' advances to
 * the upper half, which is what the caller keeps.  Finally the kept
 * page's use count is set to 1.
 */
#define EXPAND(addr,low,high) \
do { unsigned long size = PAGE_SIZE << high; \
        while (high > low) { \
                high--; size >>= 1; cli(); \
                add_mem_queue(free_area_list+high, addr); \
                mark_used((unsigned long) addr, high); \
                restore_flags(flags); \
                addr = (struct mem_list *) (size + (unsigned long) addr); \
        } mem_map[MAP_NR((unsigned long) addr)] = 1; \
} while (0)
 732 
/*
 * Allocate a block of 2^order pages.  Returns the block's address, or
 * 0 on failure.  GFP_ATOMIC requests may dip into the reserve and
 * never sleep; other priorities retry after try_to_free_page().
 */
unsigned long __get_free_pages(int priority, unsigned long order)
{
        unsigned long flags;
        int reserved_pages;

        /* non-atomic allocation from interrupt context is a caller bug;
           warn (a few times) and degrade to GFP_ATOMIC */
        if (intr_count && priority != GFP_ATOMIC) {
                static int count = 0;
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                        priority = GFP_ATOMIC;
                }
        }
        /* NFS gets a smaller reserve so it can make progress under pressure */
        reserved_pages = 5;
        if (priority != GFP_NFS)
                reserved_pages = min_free_pages;
        save_flags(flags);
repeat:
        cli();
        if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
                /* RMQUEUE returns from this function on success */
                RMQUEUE(order);
                restore_flags(flags);
                return 0;
        }
        restore_flags(flags);
        /* GFP_BUFFER must not recurse into the buffer-freeing path */
        if (priority != GFP_BUFFER && try_to_free_page(priority))
                goto repeat;
        return 0;
}
 762 
 763 /*
 764  * Yes, I know this is ugly. Don't tell me.
 765  */
/*
 * Allocate pages usable for DMA (address below MAX_DMA_ADDRESS).
 * Unsuitable blocks returned by __get_free_pages() are parked on a
 * temporary singly-linked list threaded through the pages themselves,
 * so the allocator cannot hand them out again; once a suitable block
 * (or failure, result 0 — also < limit) ends the loop, all parked
 * blocks are given back.
 */
unsigned long __get_dma_pages(int priority, unsigned long order)
{
        unsigned long list = 0;
        unsigned long result;
        unsigned long limit = MAX_DMA_ADDRESS;

        /* if (EISA_bus) limit = ~0UL; */
        /* don't recurse into page freeing while hoarding pages */
        if (priority != GFP_ATOMIC)
                priority = GFP_BUFFER;
        for (;;) {
                result = __get_free_pages(priority, order);
                if (result < limit) /* covers failure as well */
                        break;
                /* park the non-DMA block: first word links to the previous one */
                *(unsigned long *) result = list;
                list = result;
        }
        /* return every parked block to the allocator */
        while (list) {
                unsigned long tmp = list;
                list = *(unsigned long *) list;
                free_pages(tmp, order);
        }
        return result;
}
 789 
 790 /*
 791  * Show free area list (used inside shift_scroll-lock stuff)
 792  * We also calculate the percentage fragmentation. We do this by counting the
 793  * memory on each free list with the exception of the first item on the list.
 794  */
 795 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 796 {
 797         unsigned long order, flags;
 798         unsigned long total = 0;
 799 
 800         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 801         save_flags(flags);
 802         cli();
 803         for (order=0 ; order < NR_MEM_LISTS; order++) {
 804                 struct mem_list * tmp;
 805                 unsigned long nr = 0;
 806                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 807                         nr ++;
 808                 }
 809                 total += nr * ((PAGE_SIZE>>10) << order);
 810                 printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
 811         }
 812         restore_flags(flags);
 813         printk("= %lukB)\n", total);
 814 #ifdef SWAP_CACHE_INFO
 815         show_swap_cache_info();
 816 #endif  
 817 }
 818 
 819 /*
 820  * Trying to stop swapping from a file is fraught with races, so
 821  * we repeat quite a bit here when we have to pause. swapoff()
 822  * isn't exactly timing-critical, so who cares (but this is /really/
 823  * inefficient, ugh).
 824  *
 825  * We return 1 after having slept, which makes the process start over
 826  * from the beginning for this process..
 827  */
 828 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 829         pte_t *dir, unsigned int type, unsigned long page)
 830 {
 831         pte_t pte = *dir;
 832 
 833         if (pte_none(pte))
 834                 return 0;
 835         if (pte_present(pte)) {
 836                 unsigned long page = pte_page(pte);
 837                 if (page >= high_memory)
 838                         return 0;
 839                 if (!in_swap_cache(page))
 840                         return 0;
 841                 if (SWP_TYPE(in_swap_cache(page)) != type)
 842                         return 0;
 843                 delete_from_swap_cache(page);
 844                 *dir = pte_mkdirty(pte);
 845                 return 0;
 846         }
 847         if (SWP_TYPE(pte_val(pte)) != type)
 848                 return 0;
 849         read_swap_page(pte_val(pte), (char *) page);
 850         if (pte_val(*dir) != pte_val(pte)) {
 851                 free_page(page);
 852                 return 1;
 853         }
 854         *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 855         ++vma->vm_task->mm->rss;
 856         swap_free(pte_val(pte));
 857         return 1;
 858 }
 859 
static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
        unsigned long address, unsigned long size, unsigned long offset,
        unsigned int type, unsigned long page)
{
        pte_t * pte;
        unsigned long end;

        /*
         * Walk the pte entries under one pmd and hand each one to
         * unuse_pte().  "offset" accumulates the aligned base of the
         * enclosing ranges so that offset+address reconstructs the
         * full virtual address.  Returns 1 if a lower level slept,
         * so the caller can restart the scan.
         */
        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);         /* don't walk a corrupt table */
                return 0;
        }
        pte = pte_offset(dir, address);
        offset += address & PMD_MASK;   /* remember the pmd-aligned base */
        address &= ~PMD_MASK;           /* ...and keep only the offset in it */
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;         /* clamp to this pmd's span */
        do {
                /* offset+address-vma->vm_start = offset within the vma */
                if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
                        return 1;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}
 888 
static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
        unsigned long address, unsigned long size,
        unsigned int type, unsigned long page)
{
        pmd_t * pmd;
        unsigned long offset, end;

        /*
         * Walk the pmd tables under one page-directory entry, handing
         * each sub-range to unuse_pmd().  Returns 1 if a lower level
         * slept, so the caller can restart the whole scan.
         */
        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);         /* don't walk a corrupt table */
                return 0;
        }
        pmd = pmd_offset(dir, address);
        offset = address & PGDIR_MASK;  /* pgd-aligned base of this range */
        address &= ~PGDIR_MASK;         /* offset within the pgd range */
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;       /* clamp to this pgd entry's span */
        do {
                if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
                        return 1;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
 917 
 918 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 919         unsigned long start, unsigned long end,
 920         unsigned int type, unsigned long page)
 921 {
 922         while (start < end) {
 923                 if (unuse_pgd(vma, pgdir, start, end - start, type, page))
 924                         return 1;
 925                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 926                 pgdir++;
 927         }
 928         return 0;
 929 }
 930 
 931 static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
     /* [previous][next][first][last][top][bottom][index][help] */
 932 {
 933         struct vm_area_struct* vma;
 934 
 935         /*
 936          * Go through process' page directory.
 937          */
 938         vma = p->mm->mmap;
 939         while (vma) {
 940                 pgd_t * pgd = pgd_offset(p, vma->vm_start);
 941                 if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
 942                         return 1;
 943                 vma = vma->vm_next;
 944         }
 945         return 0;
 946 }
 947 
 948 /*
 949  * To avoid races, we repeat for each process after having
 950  * swapped something in. That gets rid of a few pesky races,
 951  * and "swapoff" isn't exactly timing critical.
 952  */
 953 static int try_to_unuse(unsigned int type)
     /* [previous][next][first][last][top][bottom][index][help] */
 954 {
 955         int nr;
 956         unsigned long page = get_free_page(GFP_KERNEL);
 957 
 958         if (!page)
 959                 return -ENOMEM;
 960         nr = 0;
 961         while (nr < NR_TASKS) {
 962                 if (task[nr]) {
 963                         if (unuse_process(task[nr], type, page)) {
 964                                 page = get_free_page(GFP_KERNEL);
 965                                 if (!page)
 966                                         return -ENOMEM;
 967                                 continue;
 968                         }
 969                 }
 970                 nr++;
 971         }
 972         free_page(page);
 973         return 0;
 974 }
 975 
asmlinkage int sys_swapoff(const char * specialfile)
{
        struct swap_info_struct * p;
        struct inode * inode;
        unsigned int type;
        struct file filp;
        int i;

        /*
         * Turn off swapping to the named file or block device: bring
         * every page still out on that area back into memory
         * (try_to_unuse), then release the area's resources.
         */
        if (!suser())
                return -EPERM;  /* root only */
        i = namei(specialfile,&inode);
        if (i)
                return i;
        /* Find the active swap area backed by this inode or device. */
        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++) {
                if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                if (p->swap_file) {
                        if (p->swap_file == inode)
                                break;
                } else {
                        if (!S_ISBLK(inode->i_mode))
                                continue;
                        if (p->swap_device == inode->i_rdev)
                                break;
                }
        }

        if (type >= nr_swapfiles){
                iput(inode);
                return -EINVAL;
        }
        /* Block further allocations from the area while we drain it. */
        p->flags = SWP_USED;
        i = try_to_unuse(type);
        if (i) {
                /* Couldn't pull everything back: re-enable the area. */
                iput(inode);
                p->flags = SWP_WRITEOK;
                return i;
        }

        if(p->swap_device){
                memset(&filp, 0, sizeof(filp));
                filp.f_inode = inode;
                filp.f_mode = 3; /* read write */
                /* open it again to get fops */
                if( !blkdev_open(inode, &filp) &&
                   filp.f_op && filp.f_op->release){
                        /*
                         * NOTE(review): release is called twice, apparently
                         * once to balance the blkdev_open() just above and
                         * once for the open done at swapon() time --
                         * confirm against sys_swapon().
                         */
                        filp.f_op->release(inode,&filp);
                        filp.f_op->release(inode,&filp);
                }
        }
        iput(inode);

        /* Tear down the bookkeeping for this swap area. */
        nr_swap_pages -= p->pages;
        iput(p->swap_file);
        p->swap_file = NULL;
        p->swap_device = 0;
        vfree(p->swap_map);
        p->swap_map = NULL;
        free_page((long) p->swap_lockmap);
        p->swap_lockmap = NULL;
        p->flags = 0;
        return 0;
}
1040 
/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 */
asmlinkage int sys_swapon(const char * specialfile)
{
        struct swap_info_struct * p;
        struct inode * swap_inode;
        unsigned int type;
        int i,j;
        int error;
        struct file filp;

        memset(&filp, 0, sizeof(filp));
        if (!suser())
                return -EPERM;  /* root only */
        /* Grab the first unused swap_info slot. */
        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++)
                if (!(p->flags & SWP_USED))
                        break;
        if (type >= MAX_SWAPFILES)
                return -EPERM;
        if (type >= nr_swapfiles)
                nr_swapfiles = type+1;
        /* Reserve the slot and reset its fields. */
        p->flags = SWP_USED;
        p->swap_file = NULL;
        p->swap_device = 0;
        p->swap_map = NULL;
        p->swap_lockmap = NULL;
        p->lowest_bit = 0;
        p->highest_bit = 0;
        p->max = 1;
        error = namei(specialfile,&swap_inode);
        if (error)
                goto bad_swap_2;
        p->swap_file = swap_inode;
        error = -EBUSY;
        if (swap_inode->i_count != 1)   /* refuse if anyone else has it open */
                goto bad_swap_2;
        error = -EINVAL;

        if (S_ISBLK(swap_inode->i_mode)) {
                p->swap_device = swap_inode->i_rdev;

                /* Open the block device to get at its fops. */
                filp.f_inode = swap_inode;
                filp.f_mode = 3; /* read write */
                error = blkdev_open(swap_inode, &filp);
                p->swap_file = NULL;
                iput(swap_inode);
                if(error)
                        goto bad_swap_2;
                error = -ENODEV;
                if (!p->swap_device)
                        goto bad_swap;
                error = -EBUSY;
                /* Refuse a device that is already being swapped to. */
                for (i = 0 ; i < nr_swapfiles ; i++) {
                        if (i == type)
                                continue;
                        if (p->swap_device == swap_info[i].swap_device)
                                goto bad_swap;
                }
        } else if (!S_ISREG(swap_inode->i_mode))
                goto bad_swap;
        /* The lockmap page doubles as a read buffer for the header page. */
        p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
        if (!p->swap_lockmap) {
                printk("Unable to start swapping: out of memory :-)\n");
                error = -ENOMEM;
                goto bad_swap;
        }
        /* Page 0 of the area holds the usable-page bitmap + signature. */
        read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
        if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
                printk("Unable to find swap-space signature\n");
                error = -EINVAL;
                goto bad_swap;
        }
        memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
        /*
         * Scan the header bitmap: a set bit marks a usable page.
         * Page 0 (the header itself) is never usable.
         */
        j = 0;
        p->lowest_bit = 0;
        p->highest_bit = 0;
        for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
                if (test_bit(i,p->swap_lockmap)) {
                        if (!p->lowest_bit)
                                p->lowest_bit = i;
                        p->highest_bit = i;
                        p->max = i+1;
                        j++;    /* j counts the usable pages */
                }
        }
        if (!j) {
                printk("Empty swap-file\n");
                error = -EINVAL;
                goto bad_swap;
        }
        /* swap_map[i]: 0 = free, 0x80 = unusable/reserved. */
        p->swap_map = (unsigned char *) vmalloc(p->max);
        if (!p->swap_map) {
                error = -ENOMEM;
                goto bad_swap;
        }
        for (i = 1 ; i < p->max ; i++) {
                if (test_bit(i,p->swap_lockmap))
                        p->swap_map[i] = 0;
                else
                        p->swap_map[i] = 0x80;
        }
        p->swap_map[0] = 0x80;  /* never hand out the header page */
        /* Clear the page for its real role (its name suggests per-page
         * I/O locking; see rw_swap_page). */
        memset(p->swap_lockmap,0,PAGE_SIZE);
        p->flags = SWP_WRITEOK;
        p->pages = j;
        nr_swap_pages += j;
        printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10));
        return 0;
bad_swap:
        /* Balance the blkdev_open() done above, if any. */
        if(filp.f_op && filp.f_op->release)
                filp.f_op->release(filp.f_inode,&filp);
bad_swap_2:
        free_page((long) p->swap_lockmap);
        vfree(p->swap_map);
        iput(p->swap_file);
        p->swap_device = 0;
        p->swap_file = NULL;
        p->swap_map = NULL;
        p->swap_lockmap = NULL;
        p->flags = 0;
        return error;
}
1167 
1168 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1169 {
1170         unsigned int i, j;
1171 
1172         val->freeswap = val->totalswap = 0;
1173         for (i = 0; i < nr_swapfiles; i++) {
1174                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1175                         continue;
1176                 for (j = 0; j < swap_info[i].max; ++j)
1177                         switch (swap_info[i].swap_map[j]) {
1178                                 case 128:
1179                                         continue;
1180                                 case 0:
1181                                         ++val->freeswap;
1182                                 default:
1183                                         ++val->totalswap;
1184                         }
1185         }
1186         val->freeswap <<= PAGE_SHIFT;
1187         val->totalswap <<= PAGE_SHIFT;
1188         return;
1189 }
1190 
/*
 * set up the free-area data structures:
 *   - mark all pages MAP_PAGE_RESERVED
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        int i;

        /*
         * select nr of pages we try to keep free for important stuff
         * with a minimum of 16 pages. This is totally arbitrary
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);  /* 1/64 of all pages */
        if (i < 16)
                i = 16;
        min_free_pages = i;
        /* The swap cache is carved out first; it returns the new low mark. */
        start_mem = init_swap_cache(start_mem, end_mem);
        mem_map = (mem_map_t *) start_mem;
        p = mem_map + MAP_NR(end_mem);
        start_mem = (unsigned long) p;
        /* Every page starts out reserved.  NOTE(review): the usable pages
         * are presumably freed later by code outside this file -- confirm. */
        while (p > mem_map)
                *--p = MAP_PAGE_RESERVED;

        /* One empty queue plus one cleared bitmap per block order. */
        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
                mask += mask;           /* alignment mask for order-i blocks */
                end_mem = (end_mem + ~mask) & mask;     /* round up to a block */
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;   /* bits -> bytes */
                bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
                free_area_map[i] = (unsigned char *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */