root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_pmd
  11. swap_out_pgd
  12. swap_out_vma
  13. swap_out_process
  14. swap_out
  15. try_to_free_page
  16. add_mem_queue
  17. remove_mem_queue
  18. free_pages_ok
  19. check_free_buffers
  20. free_pages
  21. mark_used
  22. __get_free_pages
  23. __get_dma_pages
  24. show_free_areas
  25. unuse_pte
  26. unuse_pmd
  27. unuse_pgd
  28. unuse_vma
  29. unuse_process
  30. try_to_unuse
  31. sys_swapoff
  32. sys_swapon
  33. si_swapinfo
  34. free_area_init

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file should contain most things doing the swapping from/to disk.
   9  * Started 18.12.91
  10  */
  11 
  12 #include <linux/mm.h>
  13 #include <linux/sched.h>
  14 #include <linux/head.h>
  15 #include <linux/kernel.h>
  16 #include <linux/kernel_stat.h>
  17 #include <linux/errno.h>
  18 #include <linux/string.h>
  19 #include <linux/stat.h>
  20 #include <linux/swap.h>
  21 #include <linux/fs.h>
  22 
  23 #include <asm/dma.h>
  24 #include <asm/system.h> /* for cli()/sti() */
  25 #include <asm/bitops.h>
  26 #include <asm/pgtable.h>
  27 
/* Maximum number of swap files/devices that may be configured at once */
  28 #define MAX_SWAPFILES 8
  29 
/* swap_info flags: SWP_USED marks a configured entry; SWP_WRITEOK
   (which includes the SWP_USED bit, 3 == 1|2) marks one that may
   accept newly swapped-out pages */
  30 #define SWP_USED        1
  31 #define SWP_WRITEOK     3
  32 
/* Free-page threshold: allocations below this start freeing/swapping */
  33 int min_free_pages = 20;
  34 
  35 static int nr_swapfiles = 0;
/* Processes waiting for a swap page-lock bit sleep on this queue */
  36 static struct wait_queue * lock_queue = NULL;
  37 static struct {
  38         int head;       /* head of priority-ordered swapfile list */
  39         int next;       /* swapfile to be used next */
  40 } swap_list = {-1, -1};
  41 
  42 static struct swap_info_struct {
  43         unsigned int flags;
  44         unsigned int swap_device;
  45         struct inode * swap_file;
  46         unsigned char * swap_map;       /* per-slot use counts */
  47         unsigned char * swap_lockmap;   /* per-slot I/O lock bits */
  48         int lowest_bit;                 /* allocation search window */
  49         int highest_bit;
  50         int prio;                       /* swap priority */
  51         int pages;
  52         unsigned long max;              /* one past the highest valid offset */
  53         int next;                       /* next entry on swap list */
  54 } swap_info[MAX_SWAPFILES];
  55 
  56 extern int shm_swap (int);
  57 
/* One entry per physical page: the swap entry the page is a clean copy of */
  58 unsigned long *swap_cache;
  59 
  60 #ifdef SWAP_CACHE_INFO
/* Swap-cache statistics: attempted vs. successful operations */
  61 unsigned long swap_cache_add_total = 0;
  62 unsigned long swap_cache_add_success = 0;
  63 unsigned long swap_cache_del_total = 0;
  64 unsigned long swap_cache_del_success = 0;
  65 unsigned long swap_cache_find_total = 0;
  66 unsigned long swap_cache_find_success = 0;
  67 
/* Dump the swap-cache statistics (total/success pairs) to the log */
  68 extern inline void show_swap_cache_info(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  69 {
  70         printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
  71                 swap_cache_add_total, swap_cache_add_success, 
  72                 swap_cache_del_total, swap_cache_del_success,
  73                 swap_cache_find_total, swap_cache_find_success);
  74 }
  75 #endif
  76 
/*
 * Record that the page at 'addr' is a clean in-memory copy of swap
 * entry 'entry'.  Only done when the entry's swap area is writable
 * (SWP_WRITEOK).  Returns 1 if the entry was cached, 0 otherwise.
 */
  77 static int add_to_swap_cache(unsigned long addr, unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
  78 {
  79         struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
  80 
  81 #ifdef SWAP_CACHE_INFO
  82         swap_cache_add_total++;
  83 #endif
  84         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
                /* install atomically; a previous non-NULL value means
                   the cache bookkeeping got out of sync somewhere */
  85                 entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
  86                 if (entry)  {
  87                         printk("swap_cache: replacing non-NULL entry\n");
  88                 }
  89 #ifdef SWAP_CACHE_INFO
  90                 swap_cache_add_success++;
  91 #endif
  92                 return 1;
  93         }
  94         return 0;
  95 }
  96 
/*
 * Boot-time setup: carve the swap_cache array (one unsigned long per
 * physical page up to mem_end) out of memory starting at mem_start,
 * zero it, and return the first address past the array.
 */
  97 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  98         unsigned long mem_end)
  99 {
 100         unsigned long swap_cache_size;
 101 
                /* align the array start to 16 bytes */
 102         mem_start = (mem_start + 15) & ~15;
 103         swap_cache = (unsigned long *) mem_start;
 104         swap_cache_size = MAP_NR(mem_end);
 105         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
 106         return (unsigned long) (swap_cache + swap_cache_size);
 107 }
 108 
 109 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 110 {
 111         unsigned long type, offset;
 112         struct swap_info_struct * p;
 113 
 114         type = SWP_TYPE(entry);
 115         if (type >= nr_swapfiles) {
 116                 printk("Internal error: bad swap-device\n");
 117                 return;
 118         }
 119         p = &swap_info[type];
 120         offset = SWP_OFFSET(entry);
 121         if (offset >= p->max) {
 122                 printk("rw_swap_page: weirdness\n");
 123                 return;
 124         }
 125         if (p->swap_map && !p->swap_map[offset]) {
 126                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 127                 return;
 128         }
 129         if (!(p->flags & SWP_USED)) {
 130                 printk("Trying to swap to unused swap-device\n");
 131                 return;
 132         }
 133         while (set_bit(offset,p->swap_lockmap))
 134                 sleep_on(&lock_queue);
 135         if (rw == READ)
 136                 kstat.pswpin++;
 137         else
 138                 kstat.pswpout++;
 139         if (p->swap_device) {
 140                 ll_rw_page(rw,p->swap_device,offset,buf);
 141         } else if (p->swap_file) {
 142                 struct inode *swapf = p->swap_file;
 143                 unsigned int zones[PAGE_SIZE/512];
 144                 int i;
 145                 if (swapf->i_op->bmap == NULL
 146                         && swapf->i_op->smap != NULL){
 147                         /*
 148                                 With MsDOS, we use msdos_smap which return
 149                                 a sector number (not a cluster or block number).
 150                                 It is a patch to enable the UMSDOS project.
 151                                 Other people are working on better solution.
 152 
 153                                 It sounds like ll_rw_swap_file defined
 154                                 it operation size (sector size) based on
 155                                 PAGE_SIZE and the number of block to read.
 156                                 So using bmap or smap should work even if
 157                                 smap will require more blocks.
 158                         */
 159                         int j;
 160                         unsigned int block = offset << 3;
 161 
 162                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 163                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 164                                         printk("rw_swap_page: bad swap file\n");
 165                                         return;
 166                                 }
 167                         }
 168                 }else{
 169                         int j;
 170                         unsigned int block = offset
 171                                 << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
 172 
 173                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 174                                 if (!(zones[i] = bmap(swapf,block++))) {
 175                                         printk("rw_swap_page: bad swap file\n");
 176                                         return;
 177                                 }
 178                 }
 179                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 180         } else
 181                 printk("re_swap_page: no swap file or device\n");
 182         if (offset && !clear_bit(offset,p->swap_lockmap))
 183                 printk("rw_swap_page: lock already cleared\n");
 184         wake_up(&lock_queue);
 185 }
 186 
/*
 * Allocate one page-slot of swap space and return its SWP_ENTRY
 * (type, offset) cookie, or 0 if no swap is free.  Scans the
 * priority-ordered swap list, round-robining among devices of equal
 * priority; the chosen slot's map count is set to 1.
 */
 187 unsigned long get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 188 {
 189         struct swap_info_struct * p;
 190         unsigned long offset, entry;
 191         int type, wrapped = 0;
 192 
 193         type = swap_list.next;
 194         if (type < 0)
 195           return 0;
 196 
 197         while (1) {
 198                 p = &swap_info[type];
 199                 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
 200                         for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 201                                 if (p->swap_map[offset])
 202                                   continue;
                                        /* skip slots with I/O still in flight */
 203                                 if (test_bit(offset, p->swap_lockmap))
 204                                   continue;
 205                                 p->swap_map[offset] = 1;
 206                                 nr_swap_pages--;
 207                                 if (offset == p->highest_bit)
 208                                   p->highest_bit--;
 209                                 p->lowest_bit = offset;
 210                                 entry = SWP_ENTRY(type,offset);
 211 
                                        /* rotate within this priority group;
                                           at the group's end restart the scan
                                           from the head of the list */
 212                                 type = swap_info[type].next;
 213                                 if (type < 0 || p->prio != swap_info[type].prio) {
 214                                     swap_list.next = swap_list.head;
 215                                 } else {
 216                                     swap_list.next = type;
 217                                 }
 218                                 return entry;
 219                         }
 220                 }
 221                 type = p->next;
 222                 if (!wrapped) {
 223                         if (type < 0 || p->prio != swap_info[type].prio) {
                                        /* end of this priority group: retry the
                                           whole list once from the top */
 224                                 type = swap_list.head;
 225                                 wrapped = 1;
 226                         }
 227                 } else if (type < 0) {
 228                         return 0;       /* out of swap space */
 229                 }
 230         }
 231 }
 232 
 233 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 234 {
 235         struct swap_info_struct * p;
 236         unsigned long offset, type;
 237 
 238         if (!entry)
 239                 return;
 240         offset = SWP_OFFSET(entry);
 241         type = SWP_TYPE(entry);
 242         if (type & SHM_SWP_TYPE)
 243                 return;
 244         if (type >= nr_swapfiles) {
 245                 printk("Trying to duplicate nonexistent swap-page\n");
 246                 return;
 247         }
 248         p = type + swap_info;
 249         if (offset >= p->max) {
 250                 printk("swap_duplicate: weirdness\n");
 251                 return;
 252         }
 253         if (!p->swap_map[offset]) {
 254                 printk("swap_duplicate: trying to duplicate unused page\n");
 255                 return;
 256         }
 257         p->swap_map[offset]++;
 258         return;
 259 }
 260 
 261 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 262 {
 263         struct swap_info_struct * p;
 264         unsigned long offset, type;
 265 
 266         if (!entry)
 267                 return;
 268         type = SWP_TYPE(entry);
 269         if (type & SHM_SWP_TYPE)
 270                 return;
 271         if (type >= nr_swapfiles) {
 272                 printk("Trying to free nonexistent swap-page\n");
 273                 return;
 274         }
 275         p = & swap_info[type];
 276         offset = SWP_OFFSET(entry);
 277         if (offset >= p->max) {
 278                 printk("swap_free: weirdness\n");
 279                 return;
 280         }
 281         if (!(p->flags & SWP_USED)) {
 282                 printk("Trying to free swap from unused swap-device\n");
 283                 return;
 284         }
 285         if (offset < p->lowest_bit)
 286                 p->lowest_bit = offset;
 287         if (offset > p->highest_bit)
 288                 p->highest_bit = offset;
 289         if (!p->swap_map[offset])
 290                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 291         else
 292                 if (!--p->swap_map[offset])
 293                         nr_swap_pages++;
 294         if (p->prio > swap_info[swap_list.next].prio) {
 295             swap_list.next = swap_list.head;
 296         }
 297 }
 298 
 299 /*
 300  * The tests may look silly, but it essentially makes sure that
 301  * no other process did a swap-in on us just as we were waiting.
 302  *
 303  * Also, don't bother to add to the swap cache if this page-in
 304  * was due to a write access.
 305  */
/*
 * Handle a fault on a swapped-out page: allocate a fresh page, read
 * the swap entry's contents into it, and map it at *page_table.
 * __get_free_page(GFP_KERNEL) and read_swap_page() may sleep, hence
 * the repeated *page_table == entry re-checks described above.
 */
 306 void swap_in(struct vm_area_struct * vma, pte_t * page_table,
     /* [previous][next][first][last][top][bottom][index][help] */
 307         unsigned long entry, int write_access)
 308 {
 309         unsigned long page = __get_free_page(GFP_KERNEL);
 310 
 311         if (pte_val(*page_table) != entry) {
                        /* someone else swapped it in while we slept */
 312                 free_page(page);
 313                 return;
 314         }
 315         if (!page) {
 316                 *page_table = BAD_PAGE;
 317                 swap_free(entry);
 318                 oom(current);
 319                 return;
 320         }
 321         read_swap_page(entry, (char *) page);
 322         if (pte_val(*page_table) != entry) {
 323                 free_page(page);
 324                 return;
 325         }
 326         vma->vm_task->mm->rss++;
 327         vma->vm_task->mm->maj_flt++;
                /* read fault: keep the swap copy cached and map the page
                   clean, so it can be discarded again without I/O */
 328         if (!write_access && add_to_swap_cache(page, entry)) {
 329                 *page_table = mk_pte(page, vma->vm_page_prot);
 330                 return;
 331         }
                /* write fault (or no cache slot): map writable+dirty and
                   release the swap entry */
 332         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 333         swap_free(entry);
 334         return;
 335 }
 336 
 337 /*
 338  * The swap-out functions return 1 if they successfully
 339  * threw something out, and we got a free page. It returns
 340  * zero if it couldn't do anything, and any other value
 341  * indicates it decreased rss, but the page was shared.
 342  *
 343  * NOTE! If it sleeps, it *must* return 1 to make sure we
 344  * don't continue with the swap-out. Otherwise we may be
 345  * using a process that no longer actually exists (it might
 346  * have died while we slept).
 347  */
 348 static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table)
     /* [previous][next][first][last][top][bottom][index][help] */
 349 {
 350         pte_t pte;
 351         unsigned long entry;
 352         unsigned long page;
 353 
 354         pte = *page_table;
 355         if (!pte_present(pte))
 356                 return 0;
 357         page = pte_page(pte);
                /* only normal RAM pages are candidates */
 358         if (page >= high_memory)
 359                 return 0;
 360         if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
 361                 return 0;
                /* a dirtied page invalidates any clean swap-cache copy;
                   a recently-referenced page is only aged, not evicted */
 362         if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
 363                 *page_table = pte_mkold(pte);
 364                 return 0;
 365         }       
 366         if (pte_dirty(pte)) {
                        /* dirty and shared: leave it alone */
 367                 if (mem_map[MAP_NR(page)] != 1)
 368                         return 0;
 369                 if (vma->vm_ops && vma->vm_ops->swapout) {
                                /* mapping provides its own swap-out method */
 370                         vma->vm_task->mm->rss--;
 371                         vma->vm_ops->swapout(vma, address-vma->vm_start, page_table);
 372                 } else {
 373                         if (!(entry = get_swap_page()))
 374                                 return 0;
 375                         vma->vm_task->mm->rss--;
 376                         pte_val(*page_table) = entry;
 377                         invalidate();
 378                         write_swap_page(entry, (char *) page);
 379                 }
 380                 free_page(page);
 381                 return 1;       /* we slept: the process may not exist any more */
 382         }
                /* clean page whose swap copy is still valid: reinstate the
                   swap entry without doing any I/O */
 383         if ((entry = find_in_swap_cache(page)))  {
 384                 if (mem_map[MAP_NR(page)] != 1) {
 385                         *page_table = pte_mkdirty(pte);
 386                         printk("Aiee.. duplicated cached swap-cache entry\n");
 387                         return 0;
 388                 }
 389                 vma->vm_task->mm->rss--;
 390                 pte_val(*page_table) = entry;
 391                 invalidate();
 392                 free_page(page);
 393                 return 1;
 394         } 
                /* clean, uncached page: just drop the mapping.  The old map
                   count is returned; > 1 means the page was shared */
 395         vma->vm_task->mm->rss--;
 396         pte_clear(page_table);
 397         invalidate();
 398         entry = mem_map[MAP_NR(page)];
 399         free_page(page);
 400         return entry;
 401 }
 402 
 403 /*
 404  * A new implementation of swap_out().  We do not swap complete processes,
 405  * but only a small number of blocks, before we continue with the next
 406  * process.  The number of blocks actually swapped is determined on the
 407  * number of page faults, that this process actually had in the last time,
 408  * so we won't swap heavily used processes all the time ...
 409  *
 410  * Note: the priority argument is a hint on how much CPU to waste with the
 411  *       swap block search, not a hint of how many blocks to swap from
 412  *       each process.
 413  *
 414  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 415  */
 416 
 417 /*
 418  * These are the minimum and maximum number of pages to swap from one process,
 419  * before proceeding to the next:
 420  */
 421 #define SWAP_MIN        4
 422 #define SWAP_MAX        32
 423 
 424 /*
 425  * The actual number of pages to swap is determined as:
 426  * SWAP_RATIO / (number of recent major page faults)
 * (swap_out() clamps the result to the [SWAP_MIN, SWAP_MAX] range above)
 427  */
 428 #define SWAP_RATIO      128
 429 
/*
 * Scan the ptes mapped by one pmd over [address,end), attempting to
 * swap each page out.  mm->swap_address is advanced past the page
 * before each attempt, so a later pass resumes after pages we may
 * have slept on.  A nonzero try_to_swap_out() result aborts the walk
 * and is propagated to the caller.
 */
 430 static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 431         unsigned long address, unsigned long end)
 432 {
 433         pte_t * pte;
 434         unsigned long pmd_end;
 435 
 436         if (pmd_none(*dir))
 437                 return 0;
 438         if (pmd_bad(*dir)) {
 439                 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 440                 pmd_clear(dir);
 441                 return 0;
 442         }
 443         
 444         pte = pte_offset(dir, address);
 445         
                /* clip the range to this pmd's coverage */
 446         pmd_end = (address + PMD_SIZE) & PMD_MASK;
 447         if (end > pmd_end)
 448                 end = pmd_end;
 449 
 450         do {
 451                 int result;
 452                 vma->vm_task->mm->swap_address = address + PAGE_SIZE;
 453                 result = try_to_swap_out(vma, address, pte);
 454                 if (result)
 455                         return result;
 456                 address += PAGE_SIZE;
 457                 pte++;
 458         } while (address < end);
 459         return 0;
 460 }
 461 
/*
 * Walk the pmds under one page-directory entry over [address,end),
 * delegating to swap_out_pmd().  A nonzero result aborts the walk
 * and is propagated.
 */
 462 static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 463         unsigned long address, unsigned long end)
 464 {
 465         pmd_t * pmd;
 466         unsigned long pgd_end;
 467 
 468         if (pgd_none(*dir))
 469                 return 0;
 470         if (pgd_bad(*dir)) {
 471                 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 472                 pgd_clear(dir);
 473                 return 0;
 474         }
 475 
 476         pmd = pmd_offset(dir, address);
 477 
                /* clip the range to this page-directory entry's coverage */
 478         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 479         if (end > pgd_end)
 480                 end = pgd_end;
 481         
 482         do {
 483                 int result = swap_out_pmd(vma, pmd, address, end);
 484                 if (result)
 485                         return result;
 486                 address = (address + PMD_SIZE) & PMD_MASK;
 487                 pmd++;
 488         } while (address < end);
 489         return 0;
 490 }
 491 
/*
 * Try to swap something out of one vma, starting at 'start' and
 * walking page directories up to vma->vm_end.  Returns the first
 * nonzero swap_out_pgd() result, or 0 if nothing was swapped.
 */
 492 static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 493         unsigned long start)
 494 {
 495         unsigned long end;
 496 
 497         /* Don't swap out areas like shared memory which have their
 498             own separate swapping mechanism. */
 499         if (vma->vm_flags & VM_SHM)
 500                 return 0;
 501 
 502         end = vma->vm_end;
 503         while (start < end) {
 504                 int result = swap_out_pgd(vma, pgdir, start, end);
 505                 if (result)
 506                         return result;
 507                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 508                 pgdir++;
 509         }
 510         return 0;
 511 }
 512 
/*
 * Try to swap one page out of process 'p'.  Resumes where the last
 * scan of this process stopped (mm->swap_address) and walks its vmas
 * to the end of the address space.  Returns the walk's nonzero result
 * as soon as something happens, or 0 with swap_address reset when the
 * whole address space was scanned without success.
 */
 513 static int swap_out_process(struct task_struct * p)
     /* [previous][next][first][last][top][bottom][index][help] */
 514 {
 515         unsigned long address;
 516         struct vm_area_struct* vma;
 517 
 518         /*
 519          * Go through process' page directory.
 520          */
 521         address = p->mm->swap_address;
 522         p->mm->swap_address = 0;
 523 
 524         /*
 525          * Find the proper vm-area
 526          */
 527         vma = find_vma(p, address);
 528         if (!vma)
 529                 return 0;
 530         if (address < vma->vm_start)
 531                 address = vma->vm_start;
 532 
 533         for (;;) {
 534                 int result = swap_out_vma(vma, pgd_offset(p, address), address);
 535                 if (result)
 536                         return result;
 537                 vma = vma->vm_next;
 538                 if (!vma)
 539                         break;
 540                 address = vma->vm_start;
 541         }
 542         p->mm->swap_address = 0;
 543         return 0;
 544 }
 545 
/*
 * Top-level swapper: rotate through the task array (static swap_task
 * remembers where we left off) and try to free one page.  The total
 * effort is scaled down by 'priority'; each process gets a per-pass
 * budget (mm->swap_cnt) derived from its recent major-fault rate, so
 * heavily faulting processes are swapped less.  Returns 1 if a page
 * was freed, 0 otherwise.
 */
 546 static int swap_out(unsigned int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 547 {
 548         static int swap_task;
 549         int loop, counter;
 550         struct task_struct *p;
 551 
 552         counter = 6*nr_tasks >> priority;
 553         for(; counter >= 0; counter--) {
 554                 /*
 555                  * Check that swap_task is suitable for swapping.  If not, look for
 556                  * the next suitable process.
 557                  */
 558                 loop = 0;
 559                 while(1) {
 560                         if (swap_task >= NR_TASKS) {
 561                                 swap_task = 1;
 562                                 if (loop)
 563                                         /* all processes are unswappable or already swapped out */
 564                                         return 0;
 565                                 loop = 1;
 566                         }
 567 
 568                         p = task[swap_task];
 569                         if (p && p->mm->swappable && p->mm->rss)
 570                                 break;
 571 
 572                         swap_task++;
 573                 }
 574 
 575                 /*
 576                  * Determine the number of pages to swap from this process.
 577                  */
 578                 if (!p->mm->swap_cnt) {
                                /* decaying average of the recent major-fault rate */
 579                         p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
 580                         p->mm->old_maj_flt = p->mm->maj_flt;
 581 
                                /* budget = SWAP_RATIO / fault-rate, clamped to
                                   [SWAP_MIN, SWAP_MAX] */
 582                         if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
 583                                 p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
 584                                 p->mm->swap_cnt = SWAP_MIN;
 585                         } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
 586                                 p->mm->swap_cnt = SWAP_MAX;
 587                         else
 588                                 p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
 589                 }
                        /* budget exhausted: move on to the next task next time */
 590                 if (!--p->mm->swap_cnt)
 591                         swap_task++;
 592                 switch (swap_out_process(p)) {
 593                         case 0:
                                        /* nothing swappable found: advance unless
                                           the budget already advanced us */
 594                                 if (p->mm->swap_cnt)
 595                                         swap_task++;
 596                                 break;
 597                         case 1:
 598                                 return 1;
 599                         default:
 600                                 break;
 601                 }
 602         }
 603         return 0;
 604 }
 605 
 606 /*
 607  * we keep on shrinking one resource until it's considered "too hard",
 608  * and then switch to the next one (priority being an indication on how
 609  * hard we should try with the resource).
 610  *
 611  * This should automatically find the resource that can most easily be
 612  * free'd, so hopefully we'll get reasonable behaviour even under very
 613  * different circumstances.
 614  */
/*
 * NOTE: the switch deliberately jumps *into* the do-while loop, so a
 * new call resumes with the resource (buffers / shm / process pages)
 * that succeeded last time.  'i' is the effort level handed to each
 * shrinker and decreases with every full cycle.
 */
 615 static int try_to_free_page(int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 616 {
 617         static int state = 0;
 618         int i=6;
 619 
 620         switch (state) {
 621                 do {
 622                 case 0:
 623                         if (priority != GFP_NOBUFFER && shrink_buffers(i))
 624                                 return 1;
 625                         state = 1;
 626                 case 1:
 627                         if (shm_swap(i))
 628                                 return 1;
 629                         state = 2;
 630                 default:
 631                         if (swap_out(i))
 632                                 return 1;
 633                         state = 0;
 634                 } while(i--);
 635         }
 636         return 0;
 637 }
 638 
 639 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 640 {
 641         entry->prev = head;
 642         (entry->next = head->next)->prev = entry;
 643         head->next = entry;
 644 }
 645 
 646 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 647 {
 648         entry->next->prev = entry->prev;
 649         entry->prev->next = entry->next;
 650 }
 651 
 652 /*
 653  * Free_page() adds the page to the free lists. This is optimized for
 654  * fast normal cases (no error jumps taken normally).
 655  *
 656  * The way to optimize jumps for gcc-2.2.2 is to:
 657  *  - select the "normal" case and put it inside the if () { XXX }
 658  *  - no else-statements if you can avoid them
 659  *
 660  * With the above two rules, you get a straight-line execution path
 661  * for the normal case, giving better asm-code.
 662  *
 663  * free_page() may sleep since the page being freed may be a buffer
 664  * page or present in the swap cache. It will not sleep, however,
 665  * for a freshly allocated page (get_free_page()).
 666  */
 667 
 668 /*
 669  * Buddy system. Hairy. You really aren't expected to understand this
 670  */
/*
 * Return a 2^order block of pages to the buddy free lists, coalescing
 * with its buddy at each level while possible.  change_bit() on the
 * pair's bit tells whether the buddy is also free (bit toggles back to
 * 0): if so, unlink the buddy, merge, and retry one order up.
 * (addr ^ (1+~mask)) is the buddy's address, since 1+~mask == -mask ==
 * the block size in bytes.  Called with interrupts disabled (see
 * free_pages()).
 */
 671 static inline void free_pages_ok(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 672 {
 673         unsigned long index = MAP_NR(addr) >> (1 + order);
 674         unsigned long mask = PAGE_MASK << order;
 675 
 676         addr &= mask;
 677         nr_free_pages += 1 << order;
 678         while (order < NR_MEM_LISTS-1) {
 679                 if (!change_bit(index, free_area_map[order]))
 680                         break;
 681                 remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
 682                 order++;
 683                 index >>= 1;
 684                 mask <<= 1;
 685                 addr &= mask;
 686         }
 687         add_mem_queue(free_area_list+order, (struct mem_list *) addr);
 688 }
 689 
 690 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 691 {
 692         struct buffer_head * bh;
 693 
 694         bh = buffer_pages[MAP_NR(addr)];
 695         if (bh) {
 696                 struct buffer_head *tmp = bh;
 697                 do {
 698                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 699                                 refile_buffer(tmp);
 700                         tmp = tmp->b_this_page;
 701                 } while (tmp != bh);
 702         }
 703 }
 704 
/*
 * Decrement the use count of the page at 'addr', freeing the 2^order
 * block to the buddy lists when the count reaches zero.  Reserved
 * pages and addresses beyond physical memory are ignored.  The count
 * update and buddy insertion run with interrupts disabled; when the
 * count drops to 1 only a buffer reference remains, so give the
 * buffer cache a chance to refile the page's buffers.
 */
 705 void free_pages(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 706 {
 707         if (addr < high_memory) {
 708                 unsigned long flag;
 709                 mem_map_t * map = mem_map + MAP_NR(addr);
 710                 if (*map) {
 711                         if (!(*map & MAP_PAGE_RESERVED)) {
 712                                 save_flags(flag);
 713                                 cli();
 714                                 if (!--*map)  {
 715                                         free_pages_ok(addr, order);
                                                /* drop any stale swap-cache entry */
 716                                         delete_from_swap_cache(addr);
 717                                 }
 718                                 restore_flags(flag);
 719                                 if (*map == 1)
 720                                         check_free_buffers(addr);
 721                         }
 722                         return;
 723                 }
 724                 printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
 725                 printk("PC = %p\n", __builtin_return_address(0));
 726                 return;
 727         }
 728 }
 729 
 730 /*
 731  * Some ugly macros to speed up __get_free_pages()..
 732  */
/*
 * RMQUEUE(order): starting at 'order', find the first non-empty free
 * list, unlink its first block, mark it used, split any excess back
 * onto lower lists (EXPAND) and return the block's address from the
 * ENCLOSING function.  Non-hygienic: uses the caller's local 'flags'
 * for restore_flags() and must be entered with interrupts disabled.
 */
 733 #define RMQUEUE(order) \
 734 do { struct mem_list * queue = free_area_list+order; \
 735      unsigned long new_order = order; \
 736         do { struct mem_list *next = queue->next; \
 737                 if (queue != next) { \
 738                         (queue->next = next->next)->prev = queue; \
 739                         mark_used((unsigned long) next, new_order); \
 740                         nr_free_pages -= 1 << order; \
 741                         restore_flags(flags); \
 742                         EXPAND(next, order, new_order); \
 743                         return (unsigned long) next; \
 744                 } new_order++; queue++; \
 745         } while (new_order < NR_MEM_LISTS); \
 746 } while (0)
 747 
 748 static inline int mark_used(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 749 {
 750         return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
 751 }
 752 
/*
 * EXPAND(addr,low,high): split a 2^high block down to size 2^low,
 * returning each upper half to the free lists, then set the final
 * page's map count to 1.  Non-hygienic like RMQUEUE: re-disables
 * interrupts (cli) around each list insertion and restores with the
 * caller's local 'flags'.
 */
 753 #define EXPAND(addr,low,high) \
 754 do { unsigned long size = PAGE_SIZE << high; \
 755         while (high > low) { \
 756                 high--; size >>= 1; cli(); \
 757                 add_mem_queue(free_area_list+high, addr); \
 758                 mark_used((unsigned long) addr, high); \
 759                 restore_flags(flags); \
 760                 addr = (struct mem_list *) (size + (unsigned long) addr); \
 761         } mem_map[MAP_NR((unsigned long) addr)] = 1; \
 762 } while (0)
 763 
/*
 * Allocate a 2^order block of pages; returns its address or 0 on
 * failure.  GFP_ATOMIC requests may dig into the reserved pool and
 * never sleep; other priorities retry after try_to_free_page().
 * The local 'flags' variable is used by the RMQUEUE/EXPAND macros.
 */
 764 unsigned long __get_free_pages(int priority, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 765 {
 766         unsigned long flags;
 767         int reserved_pages;
 768 
                /* sleeping allocation from interrupt context is a bug:
                   complain (at most 5 times) and downgrade to atomic */
 769         if (intr_count && priority != GFP_ATOMIC) {
 770                 static int count = 0;
 771                 if (++count < 5) {
 772                         printk("gfp called nonatomically from interrupt %p\n",
 773                                 __builtin_return_address(0));
 774                         priority = GFP_ATOMIC;
 775                 }
 776         }
                /* GFP_NFS may dip further into the reserve than others */
 777         reserved_pages = 5;
 778         if (priority != GFP_NFS)
 779                 reserved_pages = min_free_pages;
 780         save_flags(flags);
 781 repeat:
 782         cli();
 783         if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
                        /* RMQUEUE returns from this function on success,
                           so falling through means the lists were empty */
 784                 RMQUEUE(order);
 785                 restore_flags(flags);
 786                 return 0;
 787         }
 788         restore_flags(flags);
                /* GFP_BUFFER must not recurse into page reclaim */
 789         if (priority != GFP_BUFFER && try_to_free_page(priority))
 790                 goto repeat;
 791         return 0;
 792 }
 793 
 794 /*
 795  * Yes, I know this is ugly. Don't tell me.
 796  */
 797 unsigned long __get_dma_pages(int priority, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 798 {
 799         unsigned long list = 0;
 800         unsigned long result;
 801         unsigned long limit = MAX_DMA_ADDRESS;
 802 
 803         /* if (EISA_bus) limit = ~0UL; */
 804         if (priority != GFP_ATOMIC)
 805                 priority = GFP_BUFFER;
 806         for (;;) {
 807                 result = __get_free_pages(priority, order);
 808                 if (result < limit) /* covers failure as well */
 809                         break;
 810                 *(unsigned long *) result = list;
 811                 list = result;
 812         }
 813         while (list) {
 814                 unsigned long tmp = list;
 815                 list = *(unsigned long *) list;
 816                 free_pages(tmp, order);
 817         }
 818         return result;
 819 }
 820 
 821 /*
 822  * Show free area list (used inside shift_scroll-lock stuff)
 823  * We also calculate the percentage fragmentation. We do this by counting the
 824  * memory on each free list with the exception of the first item on the list.
 825  */
 826 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 827 {
 828         unsigned long order, flags;
 829         unsigned long total = 0;
 830 
 831         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 832         save_flags(flags);
 833         cli();
 834         for (order=0 ; order < NR_MEM_LISTS; order++) {
 835                 struct mem_list * tmp;
 836                 unsigned long nr = 0;
 837                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 838                         nr ++;
 839                 }
 840                 total += nr * ((PAGE_SIZE>>10) << order);
 841                 printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
 842         }
 843         restore_flags(flags);
 844         printk("= %lukB)\n", total);
 845 #ifdef SWAP_CACHE_INFO
 846         show_swap_cache_info();
 847 #endif  
 848 }
 849 
 850 /*
 851  * Trying to stop swapping from a file is fraught with races, so
 852  * we repeat quite a bit here when we have to pause. swapoff()
 853  * isn't exactly timing-critical, so who cares (but this is /really/
 854  * inefficient, ugh).
 855  *
 856  * We return 1 after having slept, which makes the process start over
 857  * from the beginning for this process..
 858  */
 859 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 860         pte_t *dir, unsigned int type, unsigned long page)
 861 {
 862         pte_t pte = *dir;
 863 
 864         if (pte_none(pte))
 865                 return 0;
 866         if (pte_present(pte)) {
 867                 unsigned long page = pte_page(pte);
 868                 if (page >= high_memory)
 869                         return 0;
 870                 if (!in_swap_cache(page))
 871                         return 0;
 872                 if (SWP_TYPE(in_swap_cache(page)) != type)
 873                         return 0;
 874                 delete_from_swap_cache(page);
 875                 *dir = pte_mkdirty(pte);
 876                 return 0;
 877         }
 878         if (SWP_TYPE(pte_val(pte)) != type)
 879                 return 0;
 880         read_swap_page(pte_val(pte), (char *) page);
 881         if (pte_val(*dir) != pte_val(pte)) {
 882                 free_page(page);
 883                 return 1;
 884         }
 885         *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 886         ++vma->vm_task->mm->rss;
 887         swap_free(pte_val(pte));
 888         return 1;
 889 }
 890 
/*
 * Walk the ptes covered by one pmd entry, handing each to unuse_pte().
 * 'offset' accumulates the base address so unuse_pte() receives the
 * address relative to the vma start.  Returns 1 if a callee slept.
 */
static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
	unsigned int type, unsigned long page)
{
	pte_t * pte;
	unsigned long end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}
	pte = pte_offset(dir, address);
	/* Split 'address' into pmd base (folded into offset) and in-pmd part. */
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
			return 1;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
 919 
/*
 * Walk the pmds covered by one pgd entry, handing each sub-range to
 * unuse_pmd().  Returns 1 as soon as a callee reports it slept.
 */
static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
	unsigned int type, unsigned long page)
{
	pmd_t * pmd;
	unsigned long offset, end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	/* Split 'address' into pgd base (offset) and in-pgd part. */
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
			return 1;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
 948 
 949 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 950         unsigned long start, unsigned long end,
 951         unsigned int type, unsigned long page)
 952 {
 953         while (start < end) {
 954                 if (unuse_pgd(vma, pgdir, start, end - start, type, page))
 955                         return 1;
 956                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 957                 pgdir++;
 958         }
 959         return 0;
 960 }
 961 
 962 static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
     /* [previous][next][first][last][top][bottom][index][help] */
 963 {
 964         struct vm_area_struct* vma;
 965 
 966         /*
 967          * Go through process' page directory.
 968          */
 969         vma = p->mm->mmap;
 970         while (vma) {
 971                 pgd_t * pgd = pgd_offset(p, vma->vm_start);
 972                 if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
 973                         return 1;
 974                 vma = vma->vm_next;
 975         }
 976         return 0;
 977 }
 978 
 979 /*
 980  * To avoid races, we repeat for each process after having
 981  * swapped something in. That gets rid of a few pesky races,
 982  * and "swapoff" isn't exactly timing critical.
 983  */
 984 static int try_to_unuse(unsigned int type)
     /* [previous][next][first][last][top][bottom][index][help] */
 985 {
 986         int nr;
 987         unsigned long page = get_free_page(GFP_KERNEL);
 988 
 989         if (!page)
 990                 return -ENOMEM;
 991         nr = 0;
 992         while (nr < NR_TASKS) {
 993                 if (task[nr]) {
 994                         if (unuse_process(task[nr], type, page)) {
 995                                 page = get_free_page(GFP_KERNEL);
 996                                 if (!page)
 997                                         return -ENOMEM;
 998                                 continue;
 999                         }
1000                 }
1001                 nr++;
1002         }
1003         free_page(page);
1004         return 0;
1005 }
1006 
/*
 * The swapoff system call: find the active swap area matching
 * 'specialfile', unlink it from the priority list, pull all of its
 * pages back into the processes using them, then release every
 * resource associated with it.
 */
asmlinkage int sys_swapoff(const char * specialfile)
{
	struct swap_info_struct * p;
	struct inode * inode;
	struct file filp;
	int i, type, prev;

	if (!suser())
		return -EPERM;
	i = namei(specialfile,&inode);
	if (i)
		return i;
	/* Find the matching active (SWP_WRITEOK) entry in the swap list. */
	prev = -1;
	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
		p = swap_info + type;
		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
			if (p->swap_file) {
				if (p->swap_file == inode)
				  break;
			} else {
				if (S_ISBLK(inode->i_mode)
				    && (p->swap_device == inode->i_rdev))
				  break;
			}
		}
		prev = type;
	}
	if (type < 0){
		iput(inode);
		return -EINVAL;
	}
	/* Unlink the entry from the priority-ordered list. */
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	/* Block further allocations from this area while we drain it. */
	p->flags = SWP_USED;
	i = try_to_unuse(type);
	if (i) {
		/*
		 * Could not free everything; mark the area writable again.
		 * NOTE(review): the entry is not re-inserted into
		 * swap_list here -- verify whether that is intended.
		 */
		iput(inode);
		p->flags = SWP_WRITEOK;
		return i;
	}

	if(p->swap_device){
		memset(&filp, 0, sizeof(filp));		
		filp.f_inode = inode;
		filp.f_mode = 3; /* read write */
		/* open it again to get fops */
		if( !blkdev_open(inode, &filp) &&
		   filp.f_op && filp.f_op->release){
			/*
			 * Released twice: presumably once for the open just
			 * above and once for the open done at swapon()
			 * time -- NOTE(review): confirm.
			 */
			filp.f_op->release(inode,&filp);
			filp.f_op->release(inode,&filp);
		}
	}
	iput(inode);

	/* Tear down the bookkeeping for this swap area. */
	nr_swap_pages -= p->pages;
	iput(p->swap_file);
	p->swap_file = NULL;
	p->swap_device = 0;
	vfree(p->swap_map);
	p->swap_map = NULL;
	free_page((long) p->swap_lockmap);
	p->swap_lockmap = NULL;
	p->flags = 0;
	return 0;
}
1079 
/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call: validate the swap file/device named by
 * 'specialfile', read and check its header page, build the per-page
 * usage map, and insert the new area into the priority-ordered
 * swap_list.
 */
asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
{
	struct swap_info_struct * p;
	struct inode * swap_inode;
	unsigned int type;
	int i, j, prev;
	int error;
	struct file filp;
	static int least_priority = 0;

	memset(&filp, 0, sizeof(filp));
	if (!suser())
		return -EPERM;
	/* Grab the first unused swap_info slot. */
	p = swap_info;
	for (type = 0 ; type < nr_swapfiles ; type++,p++)
		if (!(p->flags & SWP_USED))
			break;
	if (type >= MAX_SWAPFILES)
		return -EPERM;
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
	/* Initialize the slot; SWP_USED reserves it while we set up. */
	p->flags = SWP_USED;
	p->swap_file = NULL;
	p->swap_device = 0;
	p->swap_map = NULL;
	p->swap_lockmap = NULL;
	p->lowest_bit = 0;
	p->highest_bit = 0;
	p->max = 1;
	p->next = -1;
	if (swap_flags & SWAP_FLAG_PREFER) {
		/* Caller-supplied priority. */
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
	} else {
		/* Default: each new area ranks below the previous one. */
		p->prio = --least_priority;
	}
	error = namei(specialfile,&swap_inode);
	if (error)
		goto bad_swap_2;
	p->swap_file = swap_inode;
	error = -EBUSY;
	if (swap_inode->i_count != 1)
		goto bad_swap_2;
	error = -EINVAL;

	if (S_ISBLK(swap_inode->i_mode)) {
		/* Block device: open it and check it isn't already in use. */
		p->swap_device = swap_inode->i_rdev;

		filp.f_inode = swap_inode;
		filp.f_mode = 3; /* read write */
		error = blkdev_open(swap_inode, &filp);
		p->swap_file = NULL;
		iput(swap_inode);
		if(error)
			goto bad_swap_2;
		error = -ENODEV;
		if (!p->swap_device)
			goto bad_swap;
		error = -EBUSY;
		for (i = 0 ; i < nr_swapfiles ; i++) {
			if (i == type)
				continue;
			if (p->swap_device == swap_info[i].swap_device)
				goto bad_swap;
		}
	} else if (!S_ISREG(swap_inode->i_mode))
		goto bad_swap;
	p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
	if (!p->swap_lockmap) {
		printk("Unable to start swapping: out of memory :-)\n");
		error = -ENOMEM;
		goto bad_swap;
	}
	/* Page 0 of the swap area holds the good-page bitmap + signature. */
	read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
	if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
		printk("Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}
	memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
	/* Count usable pages: a set bit in the header marks a good page. */
	j = 0;
	p->lowest_bit = 0;
	p->highest_bit = 0;
	for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
		if (test_bit(i,p->swap_lockmap)) {
			if (!p->lowest_bit)
				p->lowest_bit = i;
			p->highest_bit = i;
			p->max = i+1;
			j++;
		}
	}
	if (!j) {
		printk("Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}
	/* Build the per-page usage-count map: 0 = free, 0x80 = unusable. */
	p->swap_map = (unsigned char *) vmalloc(p->max);
	if (!p->swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}
	for (i = 1 ; i < p->max ; i++) {
		if (test_bit(i,p->swap_lockmap))
			p->swap_map[i] = 0;
		else
			p->swap_map[i] = 0x80;
	}
	p->swap_map[0] = 0x80;	/* page 0 (the header) is never allocated */
	memset(p->swap_lockmap,0,PAGE_SIZE);
	p->flags = SWP_WRITEOK;
	p->pages = j;
	nr_swap_pages += j;
	printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10));

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio) {
			break;
		}
		prev = i;
	}
	p->next = i;
	if (prev < 0) {
		swap_list.head = swap_list.next = p - swap_info;
	} else {
		swap_info[prev].next = p - swap_info;
	}
	return 0;
bad_swap:
	/* Undo the block-device open, if any, then free everything. */
	if(filp.f_op && filp.f_op->release)
		filp.f_op->release(filp.f_inode,&filp);
bad_swap_2:
	free_page((long) p->swap_lockmap);
	vfree(p->swap_map);
	iput(p->swap_file);
	p->swap_device = 0;
	p->swap_file = NULL;
	p->swap_map = NULL;
	p->swap_lockmap = NULL;
	p->flags = 0;
	return error;
}
1229 
1230 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1231 {
1232         unsigned int i, j;
1233 
1234         val->freeswap = val->totalswap = 0;
1235         for (i = 0; i < nr_swapfiles; i++) {
1236                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1237                         continue;
1238                 for (j = 0; j < swap_info[i].max; ++j)
1239                         switch (swap_info[i].swap_map[j]) {
1240                                 case 128:
1241                                         continue;
1242                                 case 0:
1243                                         ++val->freeswap;
1244                                 default:
1245                                         ++val->totalswap;
1246                         }
1247         }
1248         val->freeswap <<= PAGE_SHIFT;
1249         val->totalswap <<= PAGE_SHIFT;
1250         return;
1251 }
1252 
/*
 * set up the free-area data structures:
 *   - mark all pages MAP_PAGE_RESERVED
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * Carves the mem_map array and the per-order free-area bitmaps out of
 * the memory starting at start_mem; returns the new start_mem,
 * advanced past everything allocated here.
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
	mem_map_t * p;
	unsigned long mask = PAGE_MASK;
	int i;

	/*
	 * select nr of pages we try to keep free for important stuff
	 * with a minimum of 16 pages. This is totally arbitrary
	 */
	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
	if (i < 16)
		i = 16;
	min_free_pages = i;
	start_mem = init_swap_cache(start_mem, end_mem);
	/* Place mem_map at start_mem and mark every page reserved. */
	mem_map = (mem_map_t *) start_mem;
	p = mem_map + MAP_NR(end_mem);
	start_mem = (unsigned long) p;
	while (p > mem_map)
		*--p = MAP_PAGE_RESERVED;

	/* One empty (circular) queue plus one zeroed bitmap per order. */
	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
		unsigned long bitmap_size;
		free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
		mask += mask;	/* mask for order-i block boundaries */
		end_mem = (end_mem + ~mask) & mask;	/* round up to block size */
		bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
		bitmap_size = (bitmap_size + 7) >> 3;	/* bits -> bytes */
		bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
		free_area_map[i] = (unsigned char *) start_mem;
		memset((void *) start_mem, 0, bitmap_size);
		start_mem += bitmap_size;
	}
	return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */