root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_pmd
  11. swap_out_pgd
  12. swap_out_vma
  13. swap_out_process
  14. swap_out
  15. try_to_free_page
  16. add_mem_queue
  17. remove_mem_queue
  18. free_pages_ok
  19. check_free_buffers
  20. free_pages
  21. mark_used
  22. __get_free_pages
  23. show_free_areas
  24. unuse_pte
  25. unuse_pmd
  26. unuse_pgd
  27. unuse_vma
  28. unuse_process
  29. try_to_unuse
  30. sys_swapoff
  31. sys_swapon
  32. si_swapinfo
  33. free_area_init

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file should contain most things doing the swapping from/to disk.
   9  * Started 18.12.91
  10  */
  11 
  12 #include <linux/mm.h>
  13 #include <linux/sched.h>
  14 #include <linux/head.h>
  15 #include <linux/kernel.h>
  16 #include <linux/kernel_stat.h>
  17 #include <linux/errno.h>
  18 #include <linux/string.h>
  19 #include <linux/stat.h>
  20 #include <linux/swap.h>
  21 #include <linux/fs.h>
  22 
  23 #include <asm/dma.h>
  24 #include <asm/system.h> /* for cli()/sti() */
  25 #include <asm/bitops.h>
  26 #include <asm/pgtable.h>
  27 
  28 #define MAX_SWAPFILES 8 /* number of slots in swap_info[] below */
  29 
     /*
      * Bits in swap_info_struct.flags.  SWP_WRITEOK contains the SWP_USED
      * bit, so the (flags & SWP_WRITEOK) == SWP_WRITEOK tests in this file
      * require both bits to be set.
      */
  30 #define SWP_USED        1
  31 #define SWP_WRITEOK     3
  32 
     /*
      * __get_free_pages() only hands out pages to non-atomic callers while
      * nr_free_pages is above this reserve (GFP_NFS callers use 5 instead).
      */
  33 int min_free_pages = 20;
  34 
     /* Swap type numbers at or above this value are rejected as invalid. */
  35 static int nr_swapfiles = 0;
     /* Sleepers waiting for a bit in some swap_lockmap to be released. */
  36 static struct wait_queue * lock_queue = NULL;
     /* Both fields index swap_info[]; get_swap_page() treats < 0 as "none". */
  37 static struct {
  38         int head;       /* head of priority-ordered swapfile list */
  39         int next;       /* swapfile to be used next */
  40 } swap_list = {-1, -1};
  41 
     /* One entry per swap area, indexed by the type part of a swap entry. */
  42 static struct swap_info_struct {
  43         unsigned int flags;             /* SWP_* bits above */
  44         unsigned int swap_device;       /* if non-zero, I/O goes via ll_rw_page() */
  45         struct inode * swap_file;       /* otherwise the file to map blocks from */
  46         unsigned char * swap_map;       /* per-slot use count, 0 == free */
  47         unsigned char * swap_lockmap;   /* per-slot lock bit, held during I/O */
  48         int lowest_bit;                 /* get_swap_page() scans from here ... */
  49         int highest_bit;                /* ... up to and including here */
  50         int prio;                       /* swap priority */
  51         int pages;
  52         unsigned long max;              /* slot offsets >= max are rejected */
  53         int next;                       /* next entry on swap list */
  54 } swap_info[MAX_SWAPFILES];
  55 
     /* Called from try_to_free_page() as one of the three reclaim steps. */
  56 extern int shm_swap (int, unsigned long);
  57 
     /*
      * One word per page frame, indexed by MAP_NR(addr): the swap entry
      * recorded for that page by add_to_swap_cache(), zeroed at init.
      */
  58 unsigned long *swap_cache;
  59 
  60 #ifdef SWAP_CACHE_INFO
     /* Swap cache statistics, only compiled in with SWAP_CACHE_INFO. */
  61 unsigned long swap_cache_add_total = 0;
  62 unsigned long swap_cache_add_success = 0;
  63 unsigned long swap_cache_del_total = 0;
  64 unsigned long swap_cache_del_success = 0;
  65 unsigned long swap_cache_find_total = 0;
  66 unsigned long swap_cache_find_success = 0;
  67 
  68 extern inline void show_swap_cache_info(void)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Print the swap cache counters, each pair in total/success order.
      * The counters are unsigned long, so they are printed with %lu
      * rather than the signed %ld conversion.
      */
  69 {
  70         printk("Swap cache: add %lu/%lu, delete %lu/%lu, find %lu/%lu\n",
  71                 swap_cache_add_total, swap_cache_add_success, 
  72                 swap_cache_del_total, swap_cache_del_success,
  73                 swap_cache_find_total, swap_cache_find_success);
  74 }
  75 #endif
  76 
  77 static int add_to_swap_cache(unsigned long addr, unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Record swap entry `entry` in the swap cache slot of the page at
      * `addr`.  Nothing is recorded unless the swap area that `entry`
      * belongs to has all SWP_WRITEOK bits set.
      *
      * Returns 1 if the entry was stored, 0 otherwise.  A message is
      * printed when the slot already held a non-zero entry.
      */
  78 {
  79         struct swap_info_struct * swapdev = swap_info + SWP_TYPE(entry);
  80         unsigned long previous;
  81 
  82 #ifdef SWAP_CACHE_INFO
  83         swap_cache_add_total++;
  84 #endif
  85         if ((swapdev->flags & SWP_WRITEOK) != SWP_WRITEOK)
  86                 return 0;
  87 
  88         previous = (unsigned long) xchg_ptr(&swap_cache[MAP_NR(addr)], (void *) entry);
  89         if (previous)
  90                 printk("swap_cache: replacing non-NULL entry\n");
  91 #ifdef SWAP_CACHE_INFO
  92         swap_cache_add_success++;
  93 #endif
  94         return 1;
  95 }
  96 
  97 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  98         unsigned long mem_end)
     /*
      * Place the swap_cache table at mem_start (rounded up to a multiple
      * of 16), with MAP_NR(mem_end) zeroed entries.
      *
      * Returns the first address after the table.
      */
  99 {
 100         unsigned long nr_slots = MAP_NR(mem_end);
 101         unsigned long * table_end;
 102 
 103         swap_cache = (unsigned long *) ((mem_start + 15) & ~15);
 104         table_end = swap_cache + nr_slots;
 105         memset(swap_cache, 0, nr_slots * sizeof (unsigned long));
 106         return (unsigned long) table_end;
 107 }
 108 
 109 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Transfer one page between the memory at `buf` and the swap slot
      * named by `entry`.  rw == READ is accounted as a page-in, any other
      * value as a page-out.
      *
      * The entry is validated first (type, offset, use count, SWP_USED);
      * on any failure a message is printed and nothing else happens.
      * The slot's bit in swap_lockmap is then taken, sleeping on
      * lock_queue while set_bit() reports it as already taken, and is
      * released again (with a wake-up) once the I/O call has returned.
      *
      * A swap device is accessed through ll_rw_page(); a swap file has
      * its blocks mapped with smap() or bmap() and is accessed through
      * ll_rw_swap_file().  If block mapping fails the transfer is
      * abandoned, but the lock bit is still released so that later users
      * of the same slot do not sleep forever.
      */
 110 {
 111         unsigned long type, offset;
 112         struct swap_info_struct * p;
 113 
 114         type = SWP_TYPE(entry);
 115         if (type >= nr_swapfiles) {
 116                 printk("Internal error: bad swap-device\n");
 117                 return;
 118         }
 119         p = &swap_info[type];
 120         offset = SWP_OFFSET(entry);
 121         if (offset >= p->max) {
 122                 printk("rw_swap_page: weirdness\n");
 123                 return;
 124         }
 125         if (p->swap_map && !p->swap_map[offset]) {
 126                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 127                 return;
 128         }
 129         if (!(p->flags & SWP_USED)) {
 130                 printk("Trying to swap to unused swap-device\n");
 131                 return;
 132         }
             /* From here on every exit must go through the unlock label. */
 133         while (set_bit(offset,p->swap_lockmap))
 134                 sleep_on(&lock_queue);
 135         if (rw == READ)
 136                 kstat.pswpin++;
 137         else
 138                 kstat.pswpout++;
 139         if (p->swap_device) {
 140                 ll_rw_page(rw,p->swap_device,offset,buf);
 141         } else if (p->swap_file) {
 142                 struct inode *swapf = p->swap_file;
 143                 unsigned int zones[PAGE_SIZE/512];
 144                 int i;
 145                 if (swapf->i_op->bmap == NULL
 146                         && swapf->i_op->smap != NULL){
 147                         /*
 148                                 With MsDOS, we use msdos_smap which return
 149                                 a sector number (not a cluster or block number).
 150                                 It is a patch to enable the UMSDOS project.
 151                                 Other people are working on better solution.
 152 
 153                                 It sounds like ll_rw_swap_file defined
 154                                 it operation size (sector size) based on
 155                                 PAGE_SIZE and the number of block to read.
 156                                 So using bmap or smap should work even if
 157                                 smap will require more blocks.
 158                         */
 159                         int j;
 160                         unsigned int block = offset << 3;
 161 
 162                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 163                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 164                                         printk("rw_swap_page: bad swap file\n");
 165                                         goto unlock;
 166                                 }
 167                         }
 168                 }else{
 169                         int j;
 170                         unsigned int block = offset
 171                                 << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
 172 
 173                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 174                                 if (!(zones[i] = bmap(swapf,block++))) {
 175                                         printk("rw_swap_page: bad swap file\n");
 176                                         goto unlock;
 177                                 }
 178                 }
 179                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 180         } else
 181                 printk("rw_swap_page: no swap file or device\n");
     unlock:
             /* Slot 0 is never reported as "already cleared". */
 182         if (offset && !clear_bit(offset,p->swap_lockmap))
 183                 printk("rw_swap_page: lock already cleared\n");
 184         wake_up(&lock_queue);
 185 }
 186 
 187 unsigned long get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Reserve a free swap slot and return it as a swap entry
      * (SWP_ENTRY(type, offset)).  Returns 0 when swap_list.next is
      * negative or when no usable slot is found.
      *
      * The search starts at swap_list.next.  Within one swap area only
      * slots in [lowest_bit, highest_bit] are examined, and a slot is
      * taken only if its use count is zero and its lock bit is clear.
      */
 188 {
 189         struct swap_info_struct * p;
 190         unsigned long offset, entry;
 191         int type, wrapped = 0;
 192 
 193         type = swap_list.next;
 194         if (type < 0)
 195           return 0;
 196 
 197         while (1) {
 198                 p = &swap_info[type];
 199                 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
 200                         for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 201                                 if (p->swap_map[offset])
 202                                   continue;
 203                                 if (test_bit(offset, p->swap_lockmap))
 204                                   continue;
                                     /* Found one: take the first reference. */
 205                                 p->swap_map[offset] = 1;
 206                                 nr_swap_pages--;
 207                                 if (offset == p->highest_bit)
 208                                   p->highest_bit--;
 209                                 p->lowest_bit = offset;
 210                                 entry = SWP_ENTRY(type,offset);
 211 
                                     /*
                                      * Next allocation goes to the following area if it
                                      * has the same priority, otherwise back to the head.
                                      */
 212                                 type = swap_info[type].next;
 213                                 if (type < 0 || p->prio != swap_info[type].prio) {
 214                                     swap_list.next = swap_list.head;
 215                                 } else {
 216                                     swap_list.next = type;
 217                                 }
 218                                 return entry;
 219                         }
 220                 }
                     /*
                      * Nothing free in this area.  The first time the walk would
                      * leave the current priority group (or hit the end of the
                      * list) it restarts once from the list head; after that,
                      * reaching the end of the list means there is no space left.
                      */
 221                 type = p->next;
 222                 if (!wrapped) {
 223                         if (type < 0 || p->prio != swap_info[type].prio) {
 224                                 type = swap_list.head;
 225                                 wrapped = 1;
 226                         }
 227                 } else if (type < 0) {
 228                         return 0;       /* out of swap space */
 229                 }
 230         }
 231 }
 232 
 233 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Take one more reference on the swap slot named by `entry`.
      *
      * A zero entry and entries whose type has SHM_SWP_TYPE set are
      * ignored silently.  An unknown type, an offset beyond the area or
      * a slot whose use count is zero is reported and left untouched.
      *
      * NOTE(review): the use count is an unsigned char and is incremented
      * without a range check here - confirm callers cannot reach 255.
      */
 234 {
 235         struct swap_info_struct * area;
 236         unsigned long type;
 237         unsigned long slot;
 238 
 239         if (entry == 0)
 240                 return;
 241 
 242         type = SWP_TYPE(entry);
 243         slot = SWP_OFFSET(entry);
 244         if ((type & SHM_SWP_TYPE) != 0)
 245                 return;
 246 
 247         if (type >= nr_swapfiles) {
 248                 printk("Trying to duplicate nonexistent swap-page\n");
 249                 return;
 250         }
 251 
 252         area = &swap_info[type];
 253         if (slot >= area->max)
 254                 printk("swap_duplicate: weirdness\n");
 255         else if (area->swap_map[slot] == 0)
 256                 printk("swap_duplicate: trying to duplicate unused page\n");
 257         else
 258                 area->swap_map[slot]++;
 259 }
 260 
 261 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Drop one reference on the swap slot named by `entry`.
      *
      * A zero entry and entries whose type has SHM_SWP_TYPE set are
      * ignored silently.  An unknown type, an offset beyond the area or
      * an area without SWP_USED is reported and nothing is changed.
      *
      * Otherwise the [lowest_bit, highest_bit] search window is widened
      * to include the slot and its use count is decremented; when the
      * count reaches zero nr_swap_pages is incremented.  A slot whose
      * count is already zero is reported instead.
      */
 262 {
 263         struct swap_info_struct * p;
 264         unsigned long offset, type;
 265 
 266         if (!entry)
 267                 return;
 268         type = SWP_TYPE(entry);
 269         if (type & SHM_SWP_TYPE)
 270                 return;
 271         if (type >= nr_swapfiles) {
 272                 printk("Trying to free nonexistent swap-page\n");
 273                 return;
 274         }
 275         p = & swap_info[type];
 276         offset = SWP_OFFSET(entry);
 277         if (offset >= p->max) {
 278                 printk("swap_free: weirdness\n");
 279                 return;
 280         }
 281         if (!(p->flags & SWP_USED)) {
 282                 printk("Trying to free swap from unused swap-device\n");
 283                 return;
 284         }
 285         if (offset < p->lowest_bit)
 286                 p->lowest_bit = offset;
 287         if (offset > p->highest_bit)
 288                 p->highest_bit = offset;
 289         if (!p->swap_map[offset])
 290                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 291         else
 292                 if (!--p->swap_map[offset])
 293                         nr_swap_pages++;
             /*
              * If this area outranks the one get_swap_page() would try next,
              * restart allocation from the head of the list.  swap_list.next
              * is -1 while the list is empty, so it must not be used as an
              * index into swap_info[] before being checked.
              */
 294         if (swap_list.next >= 0 && p->prio > swap_info[swap_list.next].prio) {
 295             swap_list.next = swap_list.head;
 296         }
 297 }
 298 
 299 /*
 300  * The tests may look silly, but it essentially makes sure that
 301  * no other process did a swap-in on us just as we were waiting.
 302  *
 303  * Also, don't bother to add to the swap cache if this page-in
 304  * was due to a write access.
 305  */
 306 void swap_in(struct vm_area_struct * vma, pte_t * page_table,
     /* [previous][next][first][last][top][bottom][index][help] */
 307         unsigned long entry, int write_access)
     /*
      * Bring the page stored at swap entry `entry` back into memory and
      * install it in *page_table.
      *
      * *page_table is compared against `entry` after the page allocation
      * and again after the read; if it no longer matches, the new page is
      * freed and nothing else is done.  If no page could be allocated the
      * pte is set to BAD_PAGE, the swap reference is dropped and
      * oom(current) is called.
      *
      * For a non-write access that add_to_swap_cache() accepts, the page
      * is mapped with the vma's protection and the swap slot is kept.
      * In every other case the page is mapped writable and dirty and the
      * swap reference is dropped.
      */
 308 {
 309         unsigned long page = __get_free_page(GFP_KERNEL);
 310 
             /* NOTE(review): page may be 0 here; free_page(0) is then relied on to be harmless. */
 311         if (pte_val(*page_table) != entry) {
 312                 free_page(page);
 313                 return;
 314         }
 315         if (!page) {
 316                 *page_table = BAD_PAGE;
 317                 swap_free(entry);
 318                 oom(current);
 319                 return;
 320         }
 321         read_swap_page(entry, (char *) page);
 322         if (pte_val(*page_table) != entry) {
 323                 free_page(page);
 324                 return;
 325         }
 326         vma->vm_task->mm->rss++;
 327         vma->vm_task->mm->maj_flt++;
 328         if (!write_access && add_to_swap_cache(page, entry)) {
 329                 *page_table = mk_pte(page, vma->vm_page_prot);
 330                 return;
 331         }
 332         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 333         swap_free(entry);
 334         return;
 335 }
 336 
 337 /*
 338  * The swap-out functions return 1 if they successfully
 339  * threw something out, and we got a free page. It returns
 340  * zero if it couldn't do anything, and any other value
 341  * indicates it decreased rss, but the page was shared.
 342  *
 343  * NOTE! If it sleeps, it *must* return 1 to make sure we
 344  * don't continue with the swap-out. Otherwise we may be
 345  * using a process that no longer actually exists (it might
 346  * have died while we slept).
 347  */
 348 static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, unsigned long limit)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Try to evict the single page mapped by *page_table (virtual
      * address `address` inside `vma`).  Pages at or above high_memory,
      * at or above `limit`, or marked MAP_PAGE_RESERVED are never touched.
      */
 349 {
 350         pte_t pte;
 351         unsigned long entry;
 352         unsigned long page;
 353 
 354         pte = *page_table;
 355         if (!pte_present(pte))
 356                 return 0;
 357         page = pte_page(pte);
 358         if (page >= high_memory)
 359                 return 0;
 360         if (page >= limit)
 361                 return 0;
 362         if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
 363                 return 0;
             /*
              * A young page, or a dirty page that delete_from_swap_cache()
              * reports on, only has its pte aged on this pass.
              */
 364         if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
 365                 *page_table = pte_mkold(pte);
 366                 return 0;
 367         }       
             /*
              * Dirty page: only handled when mem_map says there is exactly one
              * user.  It is written out either through the vma's own swapout
              * operation or to a freshly reserved swap slot.
              */
 368         if (pte_dirty(pte)) {
 369                 if (mem_map[MAP_NR(page)] != 1)
 370                         return 0;
 371                 if (vma->vm_ops && vma->vm_ops->swapout) {
 372                         vma->vm_task->mm->rss--;
 373                         vma->vm_ops->swapout(vma, address-vma->vm_start, page_table);
 374                 } else {
 375                         if (!(entry = get_swap_page()))
 376                                 return 0;
 377                         vma->vm_task->mm->rss--;
 378                         pte_val(*page_table) = entry;
 379                         invalidate();
 380                         write_swap_page(entry, (char *) page);
 381                 }
 382                 free_page(page);
 383                 return 1;       /* we slept: the process may not exist any more */
 384         }
             /*
              * Clean page with a swap cache entry: point the pte back at that
              * entry and free the page, no write needed.
              */
 385         if ((entry = find_in_swap_cache(page)))  {
 386                 if (mem_map[MAP_NR(page)] != 1) {
 387                         *page_table = pte_mkdirty(pte);
 388                         printk("Aiee.. duplicated cached swap-cache entry\n");
 389                         return 0;
 390                 }
 391                 vma->vm_task->mm->rss--;
 392                 pte_val(*page_table) = entry;
 393                 invalidate();
 394                 free_page(page);
 395                 return 1;
 396         } 
             /*
              * Clean page without a swap cache entry: just drop the mapping.
              * The value returned is the mem_map count read before free_page().
              */
 397         vma->vm_task->mm->rss--;
 398         pte_clear(page_table);
 399         invalidate();
 400         entry = mem_map[MAP_NR(page)];
 401         free_page(page);
 402         return entry;
 403 }
 404 
 405 /*
 406  * A new implementation of swap_out().  We do not swap complete processes,
 407  * but only a small number of blocks, before we continue with the next
 408  * process.  The number of blocks actually swapped is determined on the
 409  * number of page faults, that this process actually had in the last time,
 410  * so we won't swap heavily used processes all the time ...
 411  *
 412  * Note: the priority argument is a hint on how much CPU to waste with the
 413  *       swap block search, not a hint of how many blocks to swap with
 414  *       each process.
 415  *
 416  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 417  */
 418 
 419 /*
 420  * These are the minimum and maximum number of pages to swap from one process,
 421  * before proceeding to the next:
 422  */
 423 #define SWAP_MIN        4
 424 #define SWAP_MAX        32
 425 
 426 /*
 427  * The actual number of pages to swap is determined as:
 428  * SWAP_RATIO / (number of recent major page faults)
     *
     * swap_out() uses mm->dec_flt, a decaying count of major faults, as
     * the divisor and clamps the result to [SWAP_MIN, SWAP_MAX].
 429  */
 430 #define SWAP_RATIO      128
 431 
 432 static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 433         unsigned long address, unsigned long end, unsigned long limit)
     /*
      * Run try_to_swap_out() over the page table entries below *dir that
      * cover [address, end), clipped to the end of the current PMD-sized
      * region.  Returns the first non-zero result, or 0 if none.
      *
      * A bad pmd is reported and cleared.
      */
 434 {
 435         pte_t * pte;
 436         unsigned long pmd_end;
 437 
 438         if (pmd_none(*dir))
 439                 return 0;
 440         if (pmd_bad(*dir)) {
 441                 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 442                 pmd_clear(dir);
 443                 return 0;
 444         }
 445         
 446         pte = pte_offset(dir, address);
 447         
 448         pmd_end = (address + PMD_SIZE) & PMD_MASK;
 449         if (end > pmd_end)
 450                 end = pmd_end;
 451 
 452         do {
 453                 int result;
                     /*
                      * Remember where to resume: swap_out_process() restarts
                      * from mm->swap_address on its next call.
                      */
 454                 vma->vm_task->mm->swap_address = address + PAGE_SIZE;
 455                 result = try_to_swap_out(vma, address, pte, limit);
 456                 if (result)
 457                         return result;
 458                 address += PAGE_SIZE;
 459                 pte++;
 460         } while (address < end);
 461         return 0;
 462 }
 463 
 464 static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 465         unsigned long address, unsigned long end, unsigned long limit)
     /*
      * Run swap_out_pmd() over the pmd entries below *dir that cover
      * [address, end), clipped to the end of the current PGDIR-sized
      * region.  Returns the first non-zero result, or 0 if none.
      *
      * A bad pgd is reported and cleared.
      */
 466 {
 467         pmd_t * pmd;
 468         unsigned long pgd_end;
 469 
 470         if (pgd_none(*dir))
 471                 return 0;
 472         if (pgd_bad(*dir)) {
 473                 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 474                 pgd_clear(dir);
 475                 return 0;
 476         }
 477 
 478         pmd = pmd_offset(dir, address);
 479 
 480         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 481         if (end > pgd_end)
 482                 end = pgd_end;
 483         
 484         do {
 485                 int result = swap_out_pmd(vma, pmd, address, end, limit);
 486                 if (result)
 487                         return result;
                     /* Continue at the start of the next PMD-sized region. */
 488                 address = (address + PMD_SIZE) & PMD_MASK;
 489                 pmd++;
 490         } while (address < end);
 491         return 0;
 492 }
 493 
 494 static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 495         unsigned long start, unsigned long limit)
     /*
      * Try to swap something out of `vma`, scanning from `start` up to
      * vma->vm_end one page directory entry at a time.  `pgdir` is the
      * directory entry that covers `start`.
      *
      * Returns the first non-zero swap_out_pgd() result, or 0.
      */
 496 {
 497         unsigned long end;
 498         int result = 0;
 499 
 500         /*
 501          * Shared memory and the like has its own swapping mechanism.
 502          */
 503         if ((vma->vm_flags & VM_SHM) != 0)
 504                 return 0;
 505 
 506         for (end = vma->vm_end; start < end; pgdir++) {
 507                 result = swap_out_pgd(vma, pgdir, start, end, limit);
 508                 if (result != 0)
 509                         break;
 510                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 511         }
 512         return result;
 513 }
 514 
 515 static int swap_out_process(struct task_struct * p, unsigned long limit)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Try to swap one page out of process `p`, resuming the scan at
      * p->mm->swap_address and walking the vm areas from there on.
      *
      * Returns the first non-zero swap_out_vma() result.  Returns 0, with
      * swap_address reset to 0, when the end of the area list is reached
      * or no area is found for the resume address.
      */
 516 {
 517         unsigned long address;
 518         struct vm_area_struct* vma;
 519 
 520         /*
 521          * Go through process' page directory.
 522          */
 523         address = p->mm->swap_address;
             /* swap_out_pmd() stores a new resume point as it scans. */
 524         p->mm->swap_address = 0;
 525 
 526         /*
 527          * Find the proper vm-area
 528          */
 529         vma = find_vma(p, address);
 530         if (!vma)
 531                 return 0;
 532         if (address < vma->vm_start)
 533                 address = vma->vm_start;
 534 
 535         for (;;) {
 536                 int result = swap_out_vma(vma, pgd_offset(p, address), address, limit);
 537                 if (result)
 538                         return result;
 539                 vma = vma->vm_next;
 540                 if (!vma)
 541                         break;
 542                 address = vma->vm_start;
 543         }
 544         p->mm->swap_address = 0;
 545         return 0;
 546 }
 547 
 548 static int swap_out(unsigned int priority, unsigned long limit)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Try to free one page by swapping out of some process.
      *
      * swap_task is a cursor into task[] that is kept between calls.  Up
      * to (6*nr_tasks >> priority) + 1 attempts are made, so a larger
      * `priority` value means less searching.
      *
      * Returns 1 as soon as swap_out_process() returns 1, otherwise 0.
      */
 549 {
 550         static int swap_task;
 551         int loop, counter;
 552         struct task_struct *p;
 553 
 554         counter = 6*nr_tasks >> priority;
 555         for(; counter >= 0; counter--) {
 556                 /*
 557                  * Check that swap_task is suitable for swapping.  If not, look for
 558                  * the next suitable process.
 559                  */
 560                 loop = 0;
 561                 while(1) {
 562                         if (swap_task >= NR_TASKS) {
                                     /* Wrap around to slot 1; slot 0 is never examined. */
 563                                 swap_task = 1;
 564                                 if (loop)
 565                                         /* all processes are unswappable or already swapped out */
 566                                         return 0;
 567                                 loop = 1;
 568                         }
 569 
 570                         p = task[swap_task];
 571                         if (p && p->mm->swappable && p->mm->rss)
 572                                 break;
 573 
 574                         swap_task++;
 575                 }
 576 
 577                 /*
 578                  * Determine the number of pages to swap from this process.
 579                  */
 580                 if (!p->mm->swap_cnt) {
                             /*
                              * dec_flt is a decaying average: three quarters of the old
                              * value plus the major faults since the last update.
                              */
 581                         p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
 582                         p->mm->old_maj_flt = p->mm->maj_flt;
 583 
 584                         if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
 585                                 p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
 586                                 p->mm->swap_cnt = SWAP_MIN;
 587                         } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
 588                                 p->mm->swap_cnt = SWAP_MAX;
 589                         else
 590                                 p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
 591                 }
                     /* Quota used up: the next attempt starts at the following task. */
 592                 if (!--p->mm->swap_cnt)
 593                         swap_task++;
 594                 switch (swap_out_process(p, limit)) {
 595                         case 0:
                                     /*
                                      * Nothing could be swapped from this process; move on,
                                      * unless the cursor was already advanced above.
                                      */
 596                                 if (p->mm->swap_cnt)
 597                                         swap_task++;
 598                                 break;
 599                         case 1:
 600                                 return 1;
 601                         default:
                                     /* rss was reduced but no page was freed: keep trying. */
 602                                 break;
 603                 }
 604         }
 605         return 0;
 606 }
 607 
 608 /*
 609  * we keep on shrinking one resource until it's considered "too hard",
 610  * and then switch to the next one (priority being an indication on how
 611  * hard we should try with the resource).
 612  *
 613  * This should automatically find the resource that can most easily be
 614  * free'd, so hopefully we'll get reasonable behaviour even under very
 615  * different circumstances.
 616  */
 617 static int try_to_free_page(int priority, unsigned long limit)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Returns 1 as soon as one of shrink_buffers(), shm_swap() or
      * swap_out() reports success, 0 if all of them fail at every level.
      *
      * `state` is kept between calls and selects the resource to start
      * with; the switch jumps straight into the body of the do/while
      * loop.  `i` is the effort level passed to the three helpers and
      * counts down from 6 to 0.  The `priority` argument is only used to
      * skip shrink_buffers() for GFP_NOBUFFER requests.
      */
 618 {
 619         static int state = 0;
 620         int i=6;
 621 
 622         switch (state) {
 623                 do {
 624                 case 0:
 625                         if (priority != GFP_NOBUFFER && shrink_buffers(i, limit))
 626                                 return 1;
 627                         state = 1;
                             /* fall through to the next resource */
 628                 case 1:
 629                         if (shm_swap(i, limit))
 630                                 return 1;
 631                         state = 2;
                             /* fall through to the next resource */
 632                 default:
 633                         if (swap_out(i, limit))
 634                                 return 1;
 635                         state = 0;
 636                 } while(i--);
 637         }
 638         return 0;
 639 }
 640 
 641 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Link `entry` into the doubly linked list directly after `head`,
      * in front of whatever element used to follow `head`.
      */
 642 {
 643         entry->prev = head;
 644         entry->next = head->next;
 645         head->next = entry->next->prev = entry;
 646 }
 647 
 648 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Unlink `entry` from its doubly linked list by joining its two
      * neighbours.  `head` is not used; entry's own links are left as is.
      */
 649 {
 650         struct mem_list * after = entry->next;
 651         (after->prev = entry->prev)->next = after;
 652 }
 653 
 654 /*
 655  * Free_page() adds the page to the free lists. This is optimized for
 656  * fast normal cases (no error jumps taken normally).
 657  *
 658  * The way to optimize jumps for gcc-2.2.2 is to:
 659  *  - select the "normal" case and put it inside the if () { XXX }
 660  *  - no else-statements if you can avoid them
 661  *
 662  * With the above two rules, you get a straight-line execution path
 663  * for the normal case, giving better asm-code.
 664  *
 665  * free_page() may sleep since the page being freed may be a buffer
 666  * page or present in the swap cache. It will not sleep, however,
 667  * for a freshly allocated page (get_free_page()).
 668  */
 669 
 670 /*
 671  * Buddy system. Hairy. You really aren't expected to understand this
     *
     * Put the block of 2^order pages at addr back on the free lists,
     * merging it with its buddy for as long as change_bit() returns
     * non-zero for the pair (presumably the bit's previous value, i.e.
     * the buddy was already free - confirm against asm/bitops.h).
     *
     * Each bit of free_area_map[order] covers one pair of buddies, hence
     * the extra shift by one in `index`.  (1+~mask) is the size of the
     * current block in bytes, so addr ^ (1+~mask) is the buddy's address.
 672  */
 673 static inline void free_pages_ok(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 674 {
 675         unsigned long index = MAP_NR(addr) >> (1 + order);
 676         unsigned long mask = PAGE_MASK << order;
 677 
 678         addr &= mask;
 679         nr_free_pages += 1 << order;
 680         while (order < NR_MEM_LISTS-1) {
 681                 if (!change_bit(index, free_area_map[order]))
 682                         break;
                     /* Take the buddy off its list and go up one order. */
 683                 remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
 684                 order++;
 685                 index >>= 1;
 686                 mask <<= 1;
 687                 addr &= mask;
 688         }
 689         add_mem_queue(free_area_list+order, (struct mem_list *) addr);
 690 }
 691 
 692 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Walk the ring of buffer heads recorded in buffer_pages[] for the
      * page at `addr` (if any) and call refile_buffer() on every buffer
      * that is on the BUF_SHARED list and whose b_dev is not 0xffff.
      */
 693 {
 694         struct buffer_head * first = buffer_pages[MAP_NR(addr)];
 695         struct buffer_head * cur;
 696 
 697         if (!first)
 698                 return;
 699         cur = first;
 700         do {
 701                 if (cur->b_list == BUF_SHARED && cur->b_dev != 0xffff)
 702                         refile_buffer(cur);
 703                 cur = cur->b_this_page;
 704         } while (cur != first);
 705 }
 706 
 707 void free_pages(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Drop one reference on the block of 2^order pages at `addr`.
      *
      * Addresses at or above high_memory and pages marked
      * MAP_PAGE_RESERVED are ignored.  When the mem_map count drops to
      * zero the block goes back to the free lists and the page is removed
      * from the swap cache.  When the count is 1 afterwards,
      * check_free_buffers() is run for the page.
      *
      * A page whose count is already zero is reported as probable memory
      * corruption, together with the caller's return address.
      */
 708 {
 709         if (addr < high_memory) {
 710                 unsigned long flag;
 711                 mem_map_t * map = mem_map + MAP_NR(addr);
 712                 if (*map) {
 713                         if (!(*map & MAP_PAGE_RESERVED)) {
                                     /* The count update and list insertion run with interrupts off. */
 714                                 save_flags(flag);
 715                                 cli();
 716                                 if (!--*map)  {
 717                                         free_pages_ok(addr, order);
 718                                         delete_from_swap_cache(addr);
 719                                 }
 720                                 restore_flags(flag);
 721                                 if (*map == 1)
 722                                         check_free_buffers(addr);
 723                         }
 724                         return;
 725                 }
 726                 printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
 727                 printk("PC = %p\n", __builtin_return_address(0));
 728                 return;
 729         }
 730 }
 731 
 732 /*
 733  * Some ugly macros to speed up __get_free_pages()..
     *
     * RMQUEUE(order, limit) searches the free lists, from `order`
     * upwards, for the first block whose address is below `limit`.
     * When one is found it is unlinked, its buddy bit is toggled with
     * mark_used(), nr_free_pages is reduced by 1 << order, interrupts
     * are restored, the unused part is given back with EXPAND() and the
     * macro RETURNS FROM THE ENCLOSING FUNCTION with the block address.
     * If nothing suitable is found, execution continues after the macro.
     *
     * It relies on a variable named `flags` (as filled in by
     * save_flags()) being in scope at the point of use.
 734  */
 735 #define RMQUEUE(order, limit) \
 736 do { struct mem_list * queue = free_area_list+order; \
 737      unsigned long new_order = order; \
 738         do { struct mem_list *prev = queue, *ret; \
 739                 while (queue != (ret = prev->next)) { \
 740                         if ((unsigned long) ret < (limit)) { \
 741                                 (prev->next = ret->next)->prev = prev; \
 742                                 mark_used((unsigned long) ret, new_order); \
 743                                 nr_free_pages -= 1 << order; \
 744                                 restore_flags(flags); \
 745                                 EXPAND(ret, order, new_order); \
 746                                 return (unsigned long) ret; \
 747                         } \
 748                         prev = ret; \
 749                 } \
 750                 new_order++; queue++; \
 751         } while (new_order < NR_MEM_LISTS); \
 752 } while (0)
 753 
 754 static inline int mark_used(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Toggle the bit in free_area_map[order] that belongs to the buddy
      * pair containing the block at `addr`, and return the result of
      * change_bit().  One bit covers two blocks of the given order, which
      * is why the page number is shifted by one more than `order`.
      */
 755 {
 756         return change_bit((MAP_NR(addr) >> order) >> 1, free_area_map[order]);
 757 }
 758 
 759 #define EXPAND(addr,low,high) \
 760 do { unsigned long size = PAGE_SIZE << high; \
 761         while (high > low) { \
 762                 high--; size >>= 1; cli(); \
 763                 add_mem_queue(free_area_list+high, addr); \
 764                 mark_used((unsigned long) addr, high); \
 765                 restore_flags(flags); \
 766                 addr = (struct mem_list *) (size + (unsigned long) addr); \
 767         } mem_map[MAP_NR((unsigned long) addr)] = 1; \
 768 } while (0)
     /*
      * EXPAND(addr, low, high): a block of order `high` was taken although
      * only order `low` is wanted.  At each step the block is halved, the
      * lower half is put on the free list of the smaller order (with its
      * buddy bit toggled, interrupts disabled meanwhile) and the upper
      * half is kept.  Finally the mem_map count of the remaining block is
      * set to 1.
      *
      * `addr` and `high` are assigned to, so both must be modifiable
      * variables, and `flags` must be in scope at the point of use.
      */
 769 
 770 unsigned long __get_free_pages(int priority, unsigned long order, unsigned long limit)
     /* [previous][next][first][last][top][bottom][index][help] */
 771 {
 772         unsigned long flags;
 773         int reserved_pages;
 774 
 775         if (intr_count && priority != GFP_ATOMIC) {
 776                 static int count = 0;
 777                 if (++count < 5) {
 778                         printk("gfp called nonatomically from interrupt %p\n",
 779                                 __builtin_return_address(0));
 780                         priority = GFP_ATOMIC;
 781                 }
 782         }
 783         reserved_pages = 5;
 784         if (priority != GFP_NFS)
 785                 reserved_pages = min_free_pages;
 786         save_flags(flags);
 787 repeat:
 788         cli();
 789         if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
 790                 RMQUEUE(order, limit);
 791                 restore_flags(flags);
 792                 return 0;
 793         }
 794         restore_flags(flags);
 795         if (priority != GFP_BUFFER && try_to_free_page(priority, limit))
 796                 goto repeat;
 797         return 0;
 798 }
 799 
 800 /*
 801  * Show free area list (used inside shift_scroll-lock stuff)
 802  * We also calculate the percentage fragmentation. We do this by counting the
 803  * memory on each free list with the exception of the first item on the list.
 804  */
 805 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 806 {
 807         unsigned long order, flags;
 808         unsigned long total = 0;
 809 
 810         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 811         save_flags(flags);
 812         cli();
 813         for (order=0 ; order < NR_MEM_LISTS; order++) {
 814                 struct mem_list * tmp;
 815                 unsigned long nr = 0;
 816                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 817                         nr ++;
 818                 }
 819                 total += nr * ((PAGE_SIZE>>10) << order);
 820                 printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
 821         }
 822         restore_flags(flags);
 823         printk("= %lukB)\n", total);
 824 #ifdef SWAP_CACHE_INFO
 825         show_swap_cache_info();
 826 #endif  
 827 }
 828 
 829 /*
 830  * Trying to stop swapping from a file is fraught with races, so
 831  * we repeat quite a bit here when we have to pause. swapoff()
 832  * isn't exactly timing-critical, so who cares (but this is /really/
 833  * inefficient, ugh).
 834  *
 835  * We return 1 after having slept, which makes the process start over
 836  * from the beginning for this process..
 837  */
 838 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 839         pte_t *dir, unsigned int type, unsigned long page)
 840 {
 841         pte_t pte = *dir;
 842 
 843         if (pte_none(pte))
 844                 return 0;
 845         if (pte_present(pte)) {
 846                 unsigned long page = pte_page(pte);
 847                 if (page >= high_memory)
 848                         return 0;
 849                 if (!in_swap_cache(page))
 850                         return 0;
 851                 if (SWP_TYPE(in_swap_cache(page)) != type)
 852                         return 0;
 853                 delete_from_swap_cache(page);
 854                 *dir = pte_mkdirty(pte);
 855                 return 0;
 856         }
 857         if (SWP_TYPE(pte_val(pte)) != type)
 858                 return 0;
 859         read_swap_page(pte_val(pte), (char *) page);
 860         if (pte_val(*dir) != pte_val(pte)) {
 861                 free_page(page);
 862                 return 1;
 863         }
 864         *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 865         ++vma->vm_task->mm->rss;
 866         swap_free(pte_val(pte));
 867         return 1;
 868 }
 869 
 870 static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 871         unsigned long address, unsigned long size, unsigned long offset,
 872         unsigned int type, unsigned long page)
 873 {
 874         pte_t * pte;
 875         unsigned long end;
 876 
 877         if (pmd_none(*dir))
 878                 return 0;
 879         if (pmd_bad(*dir)) {
 880                 printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 881                 pmd_clear(dir);
 882                 return 0;
 883         }
 884         pte = pte_offset(dir, address);
 885         offset += address & PMD_MASK;
 886         address &= ~PMD_MASK;
 887         end = address + size;
 888         if (end > PMD_SIZE)
 889                 end = PMD_SIZE;
 890         do {
 891                 if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
 892                         return 1;
 893                 address += PAGE_SIZE;
 894                 pte++;
 895         } while (address < end);
 896         return 0;
 897 }
 898 
 899 static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 900         unsigned long address, unsigned long size,
 901         unsigned int type, unsigned long page)
 902 {
 903         pmd_t * pmd;
 904         unsigned long offset, end;
 905 
 906         if (pgd_none(*dir))
 907                 return 0;
 908         if (pgd_bad(*dir)) {
 909                 printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 910                 pgd_clear(dir);
 911                 return 0;
 912         }
 913         pmd = pmd_offset(dir, address);
 914         offset = address & PGDIR_MASK;
 915         address &= ~PGDIR_MASK;
 916         end = address + size;
 917         if (end > PGDIR_SIZE)
 918                 end = PGDIR_SIZE;
 919         do {
 920                 if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
 921                         return 1;
 922                 address = (address + PMD_SIZE) & PMD_MASK;
 923                 pmd++;
 924         } while (address < end);
 925         return 0;
 926 }
 927 
 928 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 929         unsigned long start, unsigned long end,
 930         unsigned int type, unsigned long page)
 931 {
 932         while (start < end) {
 933                 if (unuse_pgd(vma, pgdir, start, end - start, type, page))
 934                         return 1;
 935                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 936                 pgdir++;
 937         }
 938         return 0;
 939 }
 940 
 941 static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
     /* [previous][next][first][last][top][bottom][index][help] */
 942 {
 943         struct vm_area_struct* vma;
 944 
 945         /*
 946          * Go through process' page directory.
 947          */
 948         vma = p->mm->mmap;
 949         while (vma) {
 950                 pgd_t * pgd = pgd_offset(p, vma->vm_start);
 951                 if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
 952                         return 1;
 953                 vma = vma->vm_next;
 954         }
 955         return 0;
 956 }
 957 
 958 /*
 959  * To avoid races, we repeat for each process after having
 960  * swapped something in. That gets rid of a few pesky races,
 961  * and "swapoff" isn't exactly timing critical.
 962  */
 963 static int try_to_unuse(unsigned int type)
     /* [previous][next][first][last][top][bottom][index][help] */
 964 {
 965         int nr;
 966         unsigned long page = get_free_page(GFP_KERNEL);
 967 
 968         if (!page)
 969                 return -ENOMEM;
 970         nr = 0;
 971         while (nr < NR_TASKS) {
 972                 if (task[nr]) {
 973                         if (unuse_process(task[nr], type, page)) {
 974                                 page = get_free_page(GFP_KERNEL);
 975                                 if (!page)
 976                                         return -ENOMEM;
 977                                 continue;
 978                         }
 979                 }
 980                 nr++;
 981         }
 982         free_page(page);
 983         return 0;
 984 }
 985 
 986 asmlinkage int sys_swapoff(const char * specialfile)
     /* [previous][next][first][last][top][bottom][index][help] */
 987 {
 988         struct swap_info_struct * p;
 989         struct inode * inode;
 990         struct file filp;
 991         int i, type, prev;
 992 
 993         if (!suser())
 994                 return -EPERM;
 995         i = namei(specialfile,&inode);
 996         if (i)
 997                 return i;
 998         prev = -1;
 999         for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1000                 p = swap_info + type;
1001                 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
1002                         if (p->swap_file) {
1003                                 if (p->swap_file == inode)
1004                                   break;
1005                         } else {
1006                                 if (S_ISBLK(inode->i_mode)
1007                                     && (p->swap_device == inode->i_rdev))
1008                                   break;
1009                         }
1010                 }
1011                 prev = type;
1012         }
1013         if (type < 0){
1014                 iput(inode);
1015                 return -EINVAL;
1016         }
1017         if (prev < 0) {
1018                 swap_list.head = p->next;
1019         } else {
1020                 swap_info[prev].next = p->next;
1021         }
1022         if (type == swap_list.next) {
1023                 /* just pick something that's safe... */
1024                 swap_list.next = swap_list.head;
1025         }
1026         p->flags = SWP_USED;
1027         i = try_to_unuse(type);
1028         if (i) {
1029                 iput(inode);
1030                 p->flags = SWP_WRITEOK;
1031                 return i;
1032         }
1033 
1034         if(p->swap_device){
1035                 memset(&filp, 0, sizeof(filp));         
1036                 filp.f_inode = inode;
1037                 filp.f_mode = 3; /* read write */
1038                 /* open it again to get fops */
1039                 if( !blkdev_open(inode, &filp) &&
1040                    filp.f_op && filp.f_op->release){
1041                         filp.f_op->release(inode,&filp);
1042                         filp.f_op->release(inode,&filp);
1043                 }
1044         }
1045         iput(inode);
1046 
1047         nr_swap_pages -= p->pages;
1048         iput(p->swap_file);
1049         p->swap_file = NULL;
1050         p->swap_device = 0;
1051         vfree(p->swap_map);
1052         p->swap_map = NULL;
1053         free_page((long) p->swap_lockmap);
1054         p->swap_lockmap = NULL;
1055         p->flags = 0;
1056         return 0;
1057 }
1058 
1059 /*
1060  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1061  *
1062  * The swapon system call
      *
      * Starts swapping to the regular file or block device "specialfile".
      * If SWAP_FLAG_PREFER is set in "swap_flags" the priority is taken from
      * the SWAP_FLAG_PRIO_MASK bits, otherwise each new area gets a priority
      * one lower than the previous automatically assigned one.
      *
      * Returns 0 on success.  Errors visible here: -EPERM (not super-user,
      * or no free slot in swap_info), the namei()/blkdev_open() error,
      * -EBUSY (inode in use elsewhere, or device already used for swap),
      * -ENODEV (block device number is 0), -EINVAL (neither a block
      * device nor a regular file, no signature, or no usable page) and
      * -ENOMEM.  On any failure the slot is cleaned up and marked free.
1063  */
1064 asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
     /* [previous][next][first][last][top][bottom][index][help] */
1065 {
1066         struct swap_info_struct * p;
1067         struct inode * swap_inode;
1068         unsigned int type;
1069         int i, j, prev;
1070         int error;
1071         struct file filp;
1072         static int least_priority = 0;
1073 
1074         memset(&filp, 0, sizeof(filp));
1075         if (!suser())
1076                 return -EPERM;
             /* reuse the first unused slot, or take a new one at the end */
1077         p = swap_info;
1078         for (type = 0 ; type < nr_swapfiles ; type++,p++)
1079                 if (!(p->flags & SWP_USED))
1080                         break;
1081         if (type >= MAX_SWAPFILES)
1082                 return -EPERM;
1083         if (type >= nr_swapfiles)
1084                 nr_swapfiles = type+1;
             /* claim the slot and reset it; it is not writable yet */
1085         p->flags = SWP_USED;
1086         p->swap_file = NULL;
1087         p->swap_device = 0;
1088         p->swap_map = NULL;
1089         p->swap_lockmap = NULL;
1090         p->lowest_bit = 0;
1091         p->highest_bit = 0;
1092         p->max = 1;
1093         p->next = -1;
1094         if (swap_flags & SWAP_FLAG_PREFER) {
1095                 p->prio =
1096                   (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1097         } else {
1098                 p->prio = --least_priority;
1099         }
1100         error = namei(specialfile,&swap_inode);
1101         if (error)
1102                 goto bad_swap_2;
1103         p->swap_file = swap_inode;
             /* refuse if ours is not the only reference to the inode */
1104         error = -EBUSY;
1105         if (swap_inode->i_count != 1)
1106                 goto bad_swap_2;
1107         error = -EINVAL;
1108 
1109         if (S_ISBLK(swap_inode->i_mode)) {
                     /*
                      * Block device: remember the device number, open the
                      * device and drop the inode reference; swap_file
                      * stays NULL for devices.
                      */
1110                 p->swap_device = swap_inode->i_rdev;
1111 
1112                 filp.f_inode = swap_inode;
1113                 filp.f_mode = 3; /* read write */
1114                 error = blkdev_open(swap_inode, &filp);
1115                 p->swap_file = NULL;
1116                 iput(swap_inode);
1117                 if(error)
1118                         goto bad_swap_2;
1119                 error = -ENODEV;
1120                 if (!p->swap_device)
1121                         goto bad_swap;
                     /* the same device must not be in another slot */
1122                 error = -EBUSY;
1123                 for (i = 0 ; i < nr_swapfiles ; i++) {
1124                         if (i == type)
1125                                 continue;
1126                         if (p->swap_device == swap_info[i].swap_device)
1127                                 goto bad_swap;
1128                 }
1129         } else if (!S_ISREG(swap_inode->i_mode))
1130                 goto bad_swap;
             /*
              * Read page 0 of the swap area into a fresh page.  Its last
              * 10 bytes must hold the signature; the signature is then
              * zeroed so the rest can be scanned as a bitmap.
              */
1131         p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
1132         if (!p->swap_lockmap) {
1133                 printk("Unable to start swapping: out of memory :-)\n");
1134                 error = -ENOMEM;
1135                 goto bad_swap;
1136         }
1137         read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
1138         if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
1139                 printk("Unable to find swap-space signature\n");
1140                 error = -EINVAL;
1141                 goto bad_swap;
1142         }
1143         memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
             /*
              * Every set bit (bit 0 is skipped) counts as one usable page:
              * record the lowest and highest set bit, one past the highest
              * as "max", and the number of set bits in j.
              */
1144         j = 0;
1145         p->lowest_bit = 0;
1146         p->highest_bit = 0;
1147         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
1148                 if (test_bit(i,p->swap_lockmap)) {
1149                         if (!p->lowest_bit)
1150                                 p->lowest_bit = i;
1151                         p->highest_bit = i;
1152                         p->max = i+1;
1153                         j++;
1154                 }
1155         }
1156         if (!j) {
1157                 printk("Empty swap-file\n");
1158                 error = -EINVAL;
1159                 goto bad_swap;
1160         }
             /*
              * One byte per page in swap_map: 0 for a page whose bit was
              * set, 0x80 for one whose bit was clear and for page 0.
              */
1161         p->swap_map = (unsigned char *) vmalloc(p->max);
1162         if (!p->swap_map) {
1163                 error = -ENOMEM;
1164                 goto bad_swap;
1165         }
1166         for (i = 1 ; i < p->max ; i++) {
1167                 if (test_bit(i,p->swap_lockmap))
1168                         p->swap_map[i] = 0;
1169                 else
1170                         p->swap_map[i] = 0x80;
1171         }
1172         p->swap_map[0] = 0x80;
             /* the bitmap has been consumed: clear the whole lockmap page */
1173         memset(p->swap_lockmap,0,PAGE_SIZE);
1174         p->flags = SWP_WRITEOK;
1175         p->pages = j;
1176         nr_swap_pages += j;
1177         printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10));
1178 
1179         /* insert swap space into swap_list: */
             /*
              * The new area goes in front of the first entry whose priority
              * is not higher than its own.  If it becomes the new head,
              * swap_list.next is pointed at it as well.
              */
1180         prev = -1;
1181         for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1182                 if (p->prio >= swap_info[i].prio) {
1183                         break;
1184                 }
1185                 prev = i;
1186         }
1187         p->next = i;
1188         if (prev < 0) {
1189                 swap_list.head = swap_list.next = p - swap_info;
1190         } else {
1191                 swap_info[prev].next = p - swap_info;
1192         }
1193         return 0;
             /* bad_swap: additionally release the device if filp has a release op */
1194 bad_swap:
1195         if(filp.f_op && filp.f_op->release)
1196                 filp.f_op->release(filp.f_inode,&filp);
             /* bad_swap_2: free whatever was set up and mark the slot unused */
1197 bad_swap_2:
1198         free_page((long) p->swap_lockmap);
1199         vfree(p->swap_map);
1200         iput(p->swap_file);
1201         p->swap_device = 0;
1202         p->swap_file = NULL;
1203         p->swap_map = NULL;
1204         p->swap_lockmap = NULL;
1205         p->flags = 0;
1206         return error;
1207 }
1208 
1209 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1210 {
1211         unsigned int i, j;
1212 
1213         val->freeswap = val->totalswap = 0;
1214         for (i = 0; i < nr_swapfiles; i++) {
1215                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1216                         continue;
1217                 for (j = 0; j < swap_info[i].max; ++j)
1218                         switch (swap_info[i].swap_map[j]) {
1219                                 case 128:
1220                                         continue;
1221                                 case 0:
1222                                         ++val->freeswap;
1223                                 default:
1224                                         ++val->totalswap;
1225                         }
1226         }
1227         val->freeswap <<= PAGE_SHIFT;
1228         val->totalswap <<= PAGE_SHIFT;
1229         return;
1230 }
1231 
1232 /*
1233  * set up the free-area data structures:
1234  *   - mark all pages MAP_PAGE_RESERVED
1235  *   - mark all memory queues empty
1236  *   - clear the memory bitmaps
1237  */
1238 unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
     /* [previous][next][first][last][top][bottom][index][help] */
1239 {
1240         mem_map_t * p;
1241         unsigned long mask = PAGE_MASK;
1242         int i;
1243 
1244         /*
1245          * select nr of pages we try to keep free for important stuff
1246          * with a minimum of 16 pages. This is totally arbitrary
1247          */
1248         i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
1249         if (i < 16)
1250                 i = 16;
1251         min_free_pages = i;
1252         start_mem = init_swap_cache(start_mem, end_mem);
1253         mem_map = (mem_map_t *) start_mem;
1254         p = mem_map + MAP_NR(end_mem);
1255         start_mem = (unsigned long) p;
1256         while (p > mem_map)
1257                 *--p = MAP_PAGE_RESERVED;
1258 
1259         for (i = 0 ; i < NR_MEM_LISTS ; i++) {
1260                 unsigned long bitmap_size;
1261                 free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
1262                 mask += mask;
1263                 end_mem = (end_mem + ~mask) & mask;
1264                 bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
1265                 bitmap_size = (bitmap_size + 7) >> 3;
1266                 bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
1267                 free_area_map[i] = (unsigned char *) start_mem;
1268                 memset((void *) start_mem, 0, bitmap_size);
1269                 start_mem += bitmap_size;
1270         }
1271         return start_mem;
1272 }

/* [previous][next][first][last][top][bottom][index][help] */