root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_process
  11. swap_out
  12. try_to_free_page
  13. add_mem_queue
  14. remove_mem_queue
  15. free_pages_ok
  16. check_free_buffers
  17. free_pages
  18. mark_used
  19. __get_free_pages
  20. __get_dma_pages
  21. show_free_areas
  22. try_to_unuse
  23. sys_swapoff
  24. sys_swapon
  25. si_swapinfo
  26. free_area_init

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file should contain most things doing the swapping from/to disk.
   9  * Started 18.12.91
  10  */
  11 
  12 #include <linux/mm.h>
  13 #include <linux/sched.h>
  14 #include <linux/head.h>
  15 #include <linux/kernel.h>
  16 #include <linux/kernel_stat.h>
  17 #include <linux/errno.h>
  18 #include <linux/string.h>
  19 #include <linux/stat.h>
  20 #include <linux/fs.h>
  21 
  22 #include <asm/dma.h>
  23 #include <asm/system.h> /* for cli()/sti() */
  24 #include <asm/bitops.h>
  25 #include <asm/pgtable.h>
  26 
  27 #define MAX_SWAPFILES 8
  28 
  29 #define SWP_USED        1
  30 #define SWP_WRITEOK     3
  31 
  32 #define SWP_TYPE(entry) (((entry) >> 1) & 0x7f)
  33 #define SWP_OFFSET(entry) ((entry) >> 12)
  34 #define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << 12))
  35 
  36 int min_free_pages = 20;
  37 
  38 static int nr_swapfiles = 0;
  39 static struct wait_queue * lock_queue = NULL;
  40 
  41 static struct swap_info_struct {
  42         unsigned long flags;
  43         struct inode * swap_file;
  44         unsigned int swap_device;
  45         unsigned char * swap_map;
  46         unsigned char * swap_lockmap;
  47         int pages;
  48         int lowest_bit;
  49         int highest_bit;
  50         unsigned long max;
  51 } swap_info[MAX_SWAPFILES];
  52 
  53 extern int shm_swap (int);
  54 
  55 unsigned long *swap_cache;
  56 
  57 #ifdef SWAP_CACHE_INFO
  58 unsigned long swap_cache_add_total = 0;
  59 unsigned long swap_cache_add_success = 0;
  60 unsigned long swap_cache_del_total = 0;
  61 unsigned long swap_cache_del_success = 0;
  62 unsigned long swap_cache_find_total = 0;
  63 unsigned long swap_cache_find_success = 0;
  64 
  65 extern inline void show_swap_cache_info(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  66 {
  67         printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
  68                 swap_cache_add_total, swap_cache_add_success, 
  69                 swap_cache_del_total, swap_cache_del_success,
  70                 swap_cache_find_total, swap_cache_find_success);
  71 }
  72 #endif
  73 
  74 static int add_to_swap_cache(unsigned long addr, unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
  75 {
  76         struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
  77 
  78 #ifdef SWAP_CACHE_INFO
  79         swap_cache_add_total++;
  80 #endif
  81         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  82                 entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
  83                 if (entry)  {
  84                         printk("swap_cache: replacing non-NULL entry\n");
  85                 }
  86 #ifdef SWAP_CACHE_INFO
  87                 swap_cache_add_success++;
  88 #endif
  89                 return 1;
  90         }
  91         return 0;
  92 }
  93 
  94 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  95         unsigned long mem_end)
  96 {
  97         unsigned long swap_cache_size;
  98 
  99         mem_start = (mem_start + 15) & ~15;
 100         swap_cache = (unsigned long *) mem_start;
 101         swap_cache_size = MAP_NR(mem_end);
 102         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
 103         return (unsigned long) (swap_cache + swap_cache_size);
 104 }
 105 
 106 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 107 {
 108         unsigned long type, offset;
 109         struct swap_info_struct * p;
 110 
 111         type = SWP_TYPE(entry);
 112         if (type >= nr_swapfiles) {
 113                 printk("Internal error: bad swap-device\n");
 114                 return;
 115         }
 116         p = &swap_info[type];
 117         offset = SWP_OFFSET(entry);
 118         if (offset >= p->max) {
 119                 printk("rw_swap_page: weirdness\n");
 120                 return;
 121         }
 122         if (p->swap_map && !p->swap_map[offset]) {
 123                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 124                 return;
 125         }
 126         if (!(p->flags & SWP_USED)) {
 127                 printk("Trying to swap to unused swap-device\n");
 128                 return;
 129         }
 130         while (set_bit(offset,p->swap_lockmap))
 131                 sleep_on(&lock_queue);
 132         if (rw == READ)
 133                 kstat.pswpin++;
 134         else
 135                 kstat.pswpout++;
 136         if (p->swap_device) {
 137                 ll_rw_page(rw,p->swap_device,offset,buf);
 138         } else if (p->swap_file) {
 139                 struct inode *swapf = p->swap_file;
 140                 unsigned int zones[8];
 141                 int i;
 142                 if (swapf->i_op->bmap == NULL
 143                         && swapf->i_op->smap != NULL){
 144                         /*
 145                                 With MsDOS, we use msdos_smap which return
 146                                 a sector number (not a cluster or block number).
 147                                 It is a patch to enable the UMSDOS project.
 148                                 Other people are working on better solution.
 149 
 150                                 It sounds like ll_rw_swap_file defined
 151                                 it operation size (sector size) based on
 152                                 PAGE_SIZE and the number of block to read.
 153                                 So using bmap or smap should work even if
 154                                 smap will require more blocks.
 155                         */
 156                         int j;
 157                         unsigned int block = offset << 3;
 158 
 159                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 160                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 161                                         printk("rw_swap_page: bad swap file\n");
 162                                         return;
 163                                 }
 164                         }
 165                 }else{
 166                         int j;
 167                         unsigned int block = offset
 168                                 << (12 - swapf->i_sb->s_blocksize_bits);
 169 
 170                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 171                                 if (!(zones[i] = bmap(swapf,block++))) {
 172                                         printk("rw_swap_page: bad swap file\n");
 173                                         return;
 174                                 }
 175                 }
 176                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 177         } else
 178                 printk("re_swap_page: no swap file or device\n");
 179         if (offset && !clear_bit(offset,p->swap_lockmap))
 180                 printk("rw_swap_page: lock already cleared\n");
 181         wake_up(&lock_queue);
 182 }
 183 
 184 unsigned int get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 185 {
 186         struct swap_info_struct * p;
 187         unsigned int offset, type;
 188 
 189         p = swap_info;
 190         for (type = 0 ; type < nr_swapfiles ; type++,p++) {
 191                 if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
 192                         continue;
 193                 for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 194                         if (p->swap_map[offset])
 195                                 continue;
 196                         p->swap_map[offset] = 1;
 197                         nr_swap_pages--;
 198                         if (offset == p->highest_bit)
 199                                 p->highest_bit--;
 200                         p->lowest_bit = offset;
 201                         return SWP_ENTRY(type,offset);
 202                 }
 203         }
 204         return 0;
 205 }
 206 
 207 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 208 {
 209         struct swap_info_struct * p;
 210         unsigned long offset, type;
 211 
 212         if (!entry)
 213                 return;
 214         offset = SWP_OFFSET(entry);
 215         type = SWP_TYPE(entry);
 216         if (type == SHM_SWP_TYPE)
 217                 return;
 218         if (type >= nr_swapfiles) {
 219                 printk("Trying to duplicate nonexistent swap-page\n");
 220                 return;
 221         }
 222         p = type + swap_info;
 223         if (offset >= p->max) {
 224                 printk("swap_duplicate: weirdness\n");
 225                 return;
 226         }
 227         if (!p->swap_map[offset]) {
 228                 printk("swap_duplicate: trying to duplicate unused page\n");
 229                 return;
 230         }
 231         p->swap_map[offset]++;
 232         return;
 233 }
 234 
 235 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 236 {
 237         struct swap_info_struct * p;
 238         unsigned long offset, type;
 239 
 240         if (!entry)
 241                 return;
 242         type = SWP_TYPE(entry);
 243         if (type == SHM_SWP_TYPE)
 244                 return;
 245         if (type >= nr_swapfiles) {
 246                 printk("Trying to free nonexistent swap-page\n");
 247                 return;
 248         }
 249         p = & swap_info[type];
 250         offset = SWP_OFFSET(entry);
 251         if (offset >= p->max) {
 252                 printk("swap_free: weirdness\n");
 253                 return;
 254         }
 255         if (!(p->flags & SWP_USED)) {
 256                 printk("Trying to free swap from unused swap-device\n");
 257                 return;
 258         }
 259         while (set_bit(offset,p->swap_lockmap))
 260                 sleep_on(&lock_queue);
 261         if (offset < p->lowest_bit)
 262                 p->lowest_bit = offset;
 263         if (offset > p->highest_bit)
 264                 p->highest_bit = offset;
 265         if (!p->swap_map[offset])
 266                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 267         else
 268                 if (!--p->swap_map[offset])
 269                         nr_swap_pages++;
 270         if (!clear_bit(offset,p->swap_lockmap))
 271                 printk("swap_free: lock already cleared\n");
 272         wake_up(&lock_queue);
 273 }
 274 
 275 /*
 276  * The tests may look silly, but it essentially makes sure that
 277  * no other process did a swap-in on us just as we were waiting.
 278  *
 279  * Also, don't bother to add to the swap cache if this page-in
 280  * was due to a write access.
 281  */
/*
 * Fault a page back in from swap: allocate a fresh page, read the
 * swap entry into it, and install it in *page_table.  The pte is
 * re-checked after every potentially-sleeping call so a concurrent
 * swap-in by another process is detected (see comment above).
 */
void swap_in(struct vm_area_struct * vma, pte_t * page_table,
	unsigned long entry, int write_access)
{
	unsigned long page = get_free_page(GFP_KERNEL);

	/* get_free_page may have slept: did someone else swap it in already? */
	if (pte_val(*page_table) != entry) {
		/* NOTE(review): page may be 0 here on allocation failure;
		   free_page presumably tolerates that -- confirm */
		free_page(page);
		return;
	}
	if (!page) {
		/* out of memory: map a dummy page and release the swap slot */
		*page_table = BAD_PAGE;
		swap_free(entry);
		oom(current);
		return;
	}
	read_swap_page(entry, (char *) page);
	/* the read sleeps too -- check the race once more */
	if (pte_val(*page_table) != entry) {
		free_page(page);
		return;
	}
	vma->vm_task->mm->rss++;
	vma->vm_task->mm->maj_flt++;	/* swap-in counts as a major fault */
	/* On a read fault, keep the page clean and remember its swap copy
	   so it can later be dropped again without another write-out. */
	if (!write_access && add_to_swap_cache(page, entry)) {
		*page_table = mk_pte(page, vma->vm_page_prot);
		return;
	}
	/* Write access (or caching refused): map writable+dirty and give
	   up our reference to the swap entry. */
	*page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	swap_free(entry);
	return;
}
 312 
/*
 * Try to evict the page mapped at *page_table ('offset' is its
 * position within 'vma').  Returns 0 if nothing was freed, 1 when
 * the page was unmapped and freed, and 1 + remaining map count when
 * a reference was dropped on a still-shared page.
 */
static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned offset, pte_t * page_table)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page;

	pte = *page_table;
	if (!pte_present(pte))
		return 0;
	page = pte_page(pte);
	if (page >= high_memory)
		return 0;
	if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
		return 0;
	/* Recently referenced, or dirtied while a now-stale swap-cache
	   copy existed: drop the stale copy / age the pte and keep it. */
	if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
		*page_table = pte_mkold(pte);
		return 0;
	}
	if (pte_dirty(pte)) {
		/* only write out pages with a single user */
		if (mem_map[MAP_NR(page)] != 1)
			return 0;
		if (vma->vm_ops && vma->vm_ops->swapout)
			vma->vm_ops->swapout(vma, offset, page_table);
		else {
			if (!(entry = get_swap_page()))
				return 0;
			/* pte now holds the swap entry instead of a mapping */
			pte_val(*page_table) = entry;
			invalidate();
			write_swap_page(entry, (char *) page);
		}
		free_page(page);
		return 1 + mem_map[MAP_NR(page)];
	}
	/* Clean page with a valid swap-cache copy: point the pte back at
	   the swap entry, no I/O needed. */
	if ((entry = find_in_swap_cache(page)))  {
		if (mem_map[MAP_NR(page)] != 1) {
			*page_table = pte_mkdirty(pte);
			printk("Aiee.. duplicated cached swap-cache entry\n");
			return 0;
		}
		pte_val(*page_table) = entry;
		invalidate();
		free_page(page);
		return 1;
	}
	/* Clean and not in the swap cache: just drop the mapping. */
	pte_clear(page_table);
	invalidate();
	free_page(page);
	return 1 + mem_map[MAP_NR(page)];
}
 362 
 363 /*
 364  * A new implementation of swap_out().  We do not swap complete processes,
 365  * but only a small number of blocks, before we continue with the next
 366  * process.  The number of blocks actually swapped is determined on the
 367  * number of page faults, that this process actually had in the last time,
 368  * so we won't swap heavily used processes all the time ...
 369  *
 370  * Note: the priority argument is a hint on much CPU to waste with the
 371  *       swap block search, not a hint, of how much blocks to swap with
 372  *       each process.
 373  *
 374  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 375  */
 376 
 377 /*
 378  * These are the minimum and maximum number of pages to swap from one process,
 379  * before proceeding to the next:
 380  */
 381 #define SWAP_MIN        4
 382 #define SWAP_MAX        32
 383 
 384 /*
 385  * The actual number of pages to swap is determined as:
 386  * SWAP_RATIO / (number of recent major page faults)
 387  */
 388 #define SWAP_RATIO      128
 389 
/*
 * Scan one process' address space, resuming where the previous scan
 * of this process stopped (p->mm->swap_address), and try to swap out
 * a single page.  Returns 1 if a page was unmapped (swap_address is
 * advanced past it), 0 when the end of the address space is reached.
 */
static int swap_out_process(struct task_struct * p)
{
	pgd_t *pgdir;
	unsigned long address;
	unsigned long offset;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = p->mm->swap_address;
	p->mm->swap_address = 0;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(p, address);
	if (!vma)
		return 0;
	if (address < vma->vm_start)
		address = vma->vm_start;

	/* split address into (pgd base, offset within the pgd's span) */
	pgdir = PAGE_DIR_OFFSET(p, address);
	offset = address & ~PGDIR_MASK;
	address &= PGDIR_MASK;
	for ( ; address < TASK_SIZE ; pgdir++, address = address + PGDIR_SIZE, offset = 0) {
		pte_t *pg_table;

		if (pgd_none(*pgdir))
			continue;
		if (pgd_bad(*pgdir)) {
			printk("Bad page directory at address %08lx: %08lx\n", address, pgd_val(*pgdir));
			pgd_clear(pgdir);
			continue;
		}
		pg_table = (pte_t *) pgd_page(*pgdir);
		if (mem_map[MAP_NR((unsigned long) pg_table)] & MAP_PAGE_RESERVED)
			continue;
		pg_table += offset >> PAGE_SHIFT;

		/*
		 * Go through this page table.
		 */
		for( ; offset < ~PGDIR_MASK ; pg_table++, offset += PAGE_SIZE) {
			/*
			 * Update vma again: advance to the area that covers
			 * the current address, bailing out past the last one.
			 */
			for (;;) {
				if (address+offset < vma->vm_end)
					break;
				vma = vma->vm_next;
				if (!vma)
					return 0;
			}

			switch(try_to_swap_out(vma, offset+address-vma->vm_start, pg_table)) {
				case 0:
					break;

				case 1:
					p->mm->rss--;
					/* continue with the following page the next time */
					p->mm->swap_address = address + offset + PAGE_SIZE;
					return 1;

				default:
					/* reference dropped on a shared page */
					p->mm->rss--;
					break;
			}
		}
	}
	/*
	 * Finish work with this process, if we reached the end of the page
	 * directory.
	 */
	return 0;
}
 467 
 468 static int swap_out(unsigned int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 469 {
 470         static int swap_task;
 471         int loop;
 472         int counter = NR_TASKS * 2 >> priority;
 473         struct task_struct *p;
 474 
 475         counter = NR_TASKS * 2 >> priority;
 476         for(; counter >= 0; counter--, swap_task++) {
 477                 /*
 478                  * Check that swap_task is suitable for swapping.  If not, look for
 479                  * the next suitable process.
 480                  */
 481                 loop = 0;
 482                 while(1) {
 483                         if (swap_task >= NR_TASKS) {
 484                                 swap_task = 1;
 485                                 if (loop)
 486                                         /* all processes are unswappable or already swapped out */
 487                                         return 0;
 488                                 loop = 1;
 489                         }
 490 
 491                         p = task[swap_task];
 492                         if (p && p->mm->swappable && p->mm->rss)
 493                                 break;
 494 
 495                         swap_task++;
 496                 }
 497 
 498                 /*
 499                  * Determine the number of pages to swap from this process.
 500                  */
 501                 if (!p->mm->swap_cnt) {
 502                         p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
 503                         p->mm->old_maj_flt = p->mm->maj_flt;
 504 
 505                         if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
 506                                 p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
 507                                 p->mm->swap_cnt = SWAP_MIN;
 508                         } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
 509                                 p->mm->swap_cnt = SWAP_MAX;
 510                         else
 511                                 p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
 512                 }
 513                 if (swap_out_process(p)) {
 514                         if ((--p->mm->swap_cnt) == 0)
 515                                 swap_task++;
 516                         return 1;
 517                 }
 518         }
 519         return 0;
 520 }
 521 
 522 static int try_to_free_page(int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 523 {
 524         int i=6;
 525 
 526         while (i--) {
 527                 if (priority != GFP_NOBUFFER && shrink_buffers(i))
 528                         return 1;
 529                 if (shm_swap(i))
 530                         return 1;
 531                 if (swap_out(i))
 532                         return 1;
 533         }
 534         return 0;
 535 }
 536 
 537 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 538 {
 539         entry->prev = head;
 540         (entry->next = head->next)->prev = entry;
 541         head->next = entry;
 542 }
 543 
 544 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 545 {
 546         entry->next->prev = entry->prev;
 547         entry->prev->next = entry->next;
 548 }
 549 
 550 /*
 551  * Free_page() adds the page to the free lists. This is optimized for
 552  * fast normal cases (no error jumps taken normally).
 553  *
 554  * The way to optimize jumps for gcc-2.2.2 is to:
 555  *  - select the "normal" case and put it inside the if () { XXX }
 556  *  - no else-statements if you can avoid them
 557  *
 558  * With the above two rules, you get a straight-line execution path
 559  * for the normal case, giving better asm-code.
 560  *
 561  * free_page() may sleep since the page being freed may be a buffer
 562  * page or present in the swap cache. It will not sleep, however,
 563  * for a freshly allocated page (get_free_page()).
 564  */
 565 
 566 /*
 567  * Buddy system. Hairy. You really aren't expected to understand this
 568  */
/*
 * Put a block of (1 << order) pages at 'addr' back on the buddy free
 * lists, merging with its buddy at each level while the buddy is also
 * free.  Called with interrupts disabled (see free_pages()).
 */
static inline void free_pages_ok(unsigned long addr, unsigned long order)
{
	unsigned long index = MAP_NR(addr) >> (1 + order);	/* buddy-pair bit */
	unsigned long mask = PAGE_MASK << order;

	addr &= mask;
	nr_free_pages += 1 << order;
	while (order < NR_MEM_LISTS-1) {
		/* change_bit returns the old bit: non-zero means the buddy
		   was free, so take it off its list and merge upwards */
		if (!change_bit(index, free_area_map[order]))
			break;
		/* addr ^ (1+~mask) == addr XOR block-size: the buddy block */
		remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
		order++;
		index >>= 1;
		mask <<= 1;
		addr &= mask;
	}
	add_mem_queue(free_area_list+order, (struct mem_list *) addr);
}
 587 
 588 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 589 {
 590         struct buffer_head * bh;
 591 
 592         bh = buffer_pages[MAP_NR(addr)];
 593         if (bh) {
 594                 struct buffer_head *tmp = bh;
 595                 do {
 596                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 597                                 refile_buffer(tmp);
 598                         tmp = tmp->b_this_page;
 599                 } while (tmp != bh);
 600         }
 601 }
 602 
/*
 * Drop one reference on the (1 << order)-page block at 'addr'.  When
 * the last reference goes, the block returns to the buddy free lists
 * and any swap-cache copy is dropped.  Reserved pages and addresses
 * past high_memory are silently ignored; freeing an already-free
 * page only complains.
 */
void free_pages(unsigned long addr, unsigned long order)
{
	if (addr < high_memory) {
		unsigned long flag;
		mem_map_t * map = mem_map + MAP_NR(addr);
		if (*map) {
			if (!(*map & MAP_PAGE_RESERVED)) {
				save_flags(flag);
				cli();	/* guard the count/free-list update */
				if (!--*map)  {
					free_pages_ok(addr, order);
					delete_from_swap_cache(addr);
				}
				restore_flags(flag);
				/* down to a single user: shared buffers on this
				   page may now belong on a different list */
				if (*map == 1)
					check_free_buffers(addr);
			}
			return;
		}
		printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
		printk("PC = %p\n", __builtin_return_address(0));
		return;
	}
}
 627 
/*
 * Some ugly macros to speed up __get_free_pages()..
 *
 * RMQUEUE: scan the free lists from 'order' upwards; at the first
 * non-empty list, unlink the head block, mark it used, split it down
 * to the wanted order via EXPAND, and return its address from the
 * ENCLOSING function.  Falls through when every list is empty.
 * Relies on a local 'flags' saved by the caller and on being entered
 * with interrupts disabled.
 */
#define RMQUEUE(order) \
do { struct mem_list * queue = free_area_list+order; \
     unsigned long new_order = order; \
	do { struct mem_list *next = queue->next; \
		if (queue != next) { \
			(queue->next = next->next)->prev = queue; \
			mark_used((unsigned long) next, new_order); \
			nr_free_pages -= 1 << order; \
			restore_flags(flags); \
			EXPAND(next, order, new_order); \
			return (unsigned long) next; \
		} new_order++; queue++; \
	} while (new_order < NR_MEM_LISTS); \
} while (0)
 645 
 646 static inline int mark_used(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 647 {
 648         return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
 649 }
 650 
/*
 * EXPAND: split the block at 'addr' (size PAGE_SIZE << high) in half
 * repeatedly, each time freeing the lower half back onto the
 * appropriate list and keeping the upper half, until a block of
 * PAGE_SIZE << low remains; that block's mem_map use-count is set
 * to 1.  Uses the caller's 'flags' around each cli() section.
 */
#define EXPAND(addr,low,high) \
do { unsigned long size = PAGE_SIZE << high; \
	while (high > low) { \
		high--; size >>= 1; cli(); \
		add_mem_queue(free_area_list+high, addr); \
		mark_used((unsigned long) addr, high); \
		restore_flags(flags); \
		addr = (struct mem_list *) (size + (unsigned long) addr); \
	} mem_map[MAP_NR((unsigned long) addr)] = 1; \
} while (0)
 661 
/*
 * Allocate (1 << order) contiguous pages.  On success the address is
 * returned from inside RMQUEUE (which embeds a 'return'); 0 means
 * failure.  GFP_ATOMIC requests never sleep and may dip below the
 * reserved-page threshold; other priorities retry after trying to
 * free memory.
 */
unsigned long __get_free_pages(int priority, unsigned long order)
{
	unsigned long flags;
	int reserved_pages;

	/* sleeping allocation from interrupt context is a caller bug:
	   warn (a few times) and degrade to atomic */
	if (intr_count && priority != GFP_ATOMIC) {
		static int count = 0;
		if (++count < 5) {
			printk("gfp called nonatomically from interrupt %p\n",
				__builtin_return_address(0));
			priority = GFP_ATOMIC;
		}
	}
	/* NFS gets a smaller reserve so it can make progress under pressure */
	reserved_pages = 5;
	if (priority != GFP_NFS)
		reserved_pages = min_free_pages;
	save_flags(flags);
repeat:
	cli();
	if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
		RMQUEUE(order);		/* returns on success */
		restore_flags(flags);
		return 0;
	}
	restore_flags(flags);
	if (priority != GFP_BUFFER && try_to_free_page(priority))
		goto repeat;
	return 0;
}
 691 
 692 /*
 693  * Yes, I know this is ugly. Don't tell me.
 694  */
 695 unsigned long __get_dma_pages(int priority, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 696 {
 697         unsigned long list = 0;
 698         unsigned long result;
 699         unsigned long limit = MAX_DMA_ADDRESS;
 700 
 701         /* if (EISA_bus) limit = ~0UL; */
 702         if (priority != GFP_ATOMIC)
 703                 priority = GFP_BUFFER;
 704         for (;;) {
 705                 result = __get_free_pages(priority, order);
 706                 if (result < limit) /* covers failure as well */
 707                         break;
 708                 *(unsigned long *) result = list;
 709                 list = result;
 710         }
 711         while (list) {
 712                 unsigned long tmp = list;
 713                 list = *(unsigned long *) list;
 714                 free_pages(tmp, order);
 715         }
 716         return result;
 717 }
 718 
 719 /*
 720  * Show free area list (used inside shift_scroll-lock stuff)
 721  * We also calculate the percentage fragmentation. We do this by counting the
 722  * memory on each free list with the exception of the first item on the list.
 723  */
 724 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 725 {
 726         unsigned long order, flags;
 727         unsigned long total = 0;
 728 
 729         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 730         save_flags(flags);
 731         cli();
 732         for (order=0 ; order < NR_MEM_LISTS; order++) {
 733                 struct mem_list * tmp;
 734                 unsigned long nr = 0;
 735                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 736                         nr ++;
 737                 }
 738                 total += nr * (4 << order);
 739                 printk("%lu*%ukB ", nr, 4 << order);
 740         }
 741         restore_flags(flags);
 742         printk("= %lukB)\n", total);
 743 #ifdef SWAP_CACHE_INFO
 744         show_swap_cache_info();
 745 #endif  
 746 }
 747 
/*
 * Trying to stop swapping from a file is fraught with races, so
 * we repeat quite a bit here when we have to pause. swapoff()
 * isn't exactly timing-critical, so who cares?
 *
 * Walk every task's page tables and bring all pages that were swapped
 * out to swap area 'type' back into memory, so the area can be
 * deactivated.  Returns 0 on success, -ENOMEM if a buffer page could
 * not be allocated.
 */
static int try_to_unuse(unsigned int type)
{
        int nr;                         /* index of the task being scanned */
        unsigned long tmp = 0;          /* spare page used as swap-in buffer */
        struct task_struct *p;

        nr = 0;
/*
 * When we have to sleep, we restart the whole algorithm from the same
 * task we stopped in. That at least rids us of all races.
 */
repeat:
        for (; nr < NR_TASKS ; nr++) {
                pgd_t * page_dir;
                int i;

                p = task[nr];
                if (!p)
                        continue;
                /* walk this task's entire page directory */
                page_dir = PAGE_DIR_OFFSET(p, 0);
                for (i = 0 ; i < PTRS_PER_PAGE ; page_dir++, i++) {
                        int j;
                        pte_t *page_table;

                        if (pgd_none(*page_dir))
                                continue;
                        if (pgd_bad(*page_dir)) {
                                printk("bad page directory entry [%d] %08lx\n", i, pgd_val(*page_dir));
                                pgd_clear(page_dir);
                                continue;
                        }
                        page_table = (pte_t *) pgd_page(*page_dir);
                        /* skip page tables in reserved memory (kernel mappings) */
                        if (mem_map[MAP_NR((unsigned long) page_table)] & MAP_PAGE_RESERVED)
                                continue;
                        for (j = 0 ; j < PTRS_PER_PAGE ; page_table++, j++) {
                                pte_t pte;
                                pte = *page_table;
                                if (pte_none(pte))
                                        continue;
                                if (pte_present(pte)) {
                                        /*
                                         * Page is in memory: if it still has a
                                         * swap-cache entry on this area, detach
                                         * it and dirty the pte so the page is
                                         * not silently dropped later.
                                         */
                                        unsigned long page = pte_page(pte);
                                        if (page >= high_memory)
                                                continue;
                                        if (!in_swap_cache(page))
                                                continue;
                                        if (SWP_TYPE(in_swap_cache(page)) != type)
                                                continue;
                                        delete_from_swap_cache(page);
                                        *page_table = pte_mkdirty(pte);
                                        continue;
                                }
                                /* not present: the pte holds a swap entry */
                                if (SWP_TYPE(pte_val(pte)) != type)
                                        continue;
                                if (!tmp) {
                                        /*
                                         * Need a buffer page.  The allocation
                                         * may sleep, so restart the scan of
                                         * this task afterwards.
                                         */
                                        if (!(tmp = __get_free_page(GFP_KERNEL)))
                                                return -ENOMEM;
                                        goto repeat;
                                }
                                read_swap_page(pte_val(pte), (char *) tmp);
                                /*
                                 * read_swap_page may have slept: if the pte
                                 * changed under us, start over.
                                 */
                                if (pte_val(*page_table) != pte_val(pte))
                                        goto repeat;
                                *page_table = pte_mkwrite(pte_mkdirty(mk_pte(tmp, PAGE_COPY)));
                                ++p->mm->rss;
                                swap_free(pte_val(pte));
                                tmp = 0;        /* buffer page is now mapped */
                        }
                }
        }
        free_page(tmp);         /* free_page(0) is a no-op */
        return 0;
}
 824 
/*
 * The swapoff system call: stop swapping to the named file or block
 * device.  All pages still swapped out to the area are brought back
 * into memory (try_to_unuse) before its bookkeeping is torn down.
 * Returns 0 on success or a negative errno.
 */
asmlinkage int sys_swapoff(const char * specialfile)
{
        struct swap_info_struct * p;
        struct inode * inode;
        unsigned int type;
        struct file filp;
        int i;

        if (!suser())
                return -EPERM;
        i = namei(specialfile,&inode);
        if (i)
                return i;
        /* find the active swap area matching this inode or device */
        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++) {
                if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                if (p->swap_file) {
                        if (p->swap_file == inode)
                                break;
                } else {
                        if (!S_ISBLK(inode->i_mode))
                                continue;
                        if (p->swap_device == inode->i_rdev)
                                break;
                }
        }

        if (type >= nr_swapfiles){
                iput(inode);
                return -EINVAL;
        }
        /* drop to SWP_USED: blocks new allocations while we unuse it */
        p->flags = SWP_USED;
        i = try_to_unuse(type);
        if (i) {
                iput(inode);
                /* could not empty the area - make it writable again */
                p->flags = SWP_WRITEOK;
                return i;
        }

        if(p->swap_device){
                /*
                 * Release the block device twice: once for the open done
                 * here to fetch the fops, once for the open done by
                 * sys_swapon().  NOTE(review): assumes release can be
                 * called back-to-back -- confirm for the driver in use.
                 */
                memset(&filp, 0, sizeof(filp));         
                filp.f_inode = inode;
                filp.f_mode = 3; /* read write */
                /* open it again to get fops */
                if( !blkdev_open(inode, &filp) &&
                   filp.f_op && filp.f_op->release){
                        filp.f_op->release(inode,&filp);
                        filp.f_op->release(inode,&filp);
                }
        }
        iput(inode);

        /* tear down the area's bookkeeping and free its tables */
        nr_swap_pages -= p->pages;
        iput(p->swap_file);
        p->swap_file = NULL;
        p->swap_device = 0;
        vfree(p->swap_map);
        p->swap_map = NULL;
        free_page((long) p->swap_lockmap);
        p->swap_lockmap = NULL;
        p->flags = 0;
        return 0;
}
 889 
 890 /*
 891  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 892  *
 893  * The swapon system call
 894  */
 895 asmlinkage int sys_swapon(const char * specialfile)
     /* [previous][next][first][last][top][bottom][index][help] */
 896 {
 897         struct swap_info_struct * p;
 898         struct inode * swap_inode;
 899         unsigned int type;
 900         int i,j;
 901         int error;
 902         struct file filp;
 903 
 904         memset(&filp, 0, sizeof(filp));
 905         if (!suser())
 906                 return -EPERM;
 907         p = swap_info;
 908         for (type = 0 ; type < nr_swapfiles ; type++,p++)
 909                 if (!(p->flags & SWP_USED))
 910                         break;
 911         if (type >= MAX_SWAPFILES)
 912                 return -EPERM;
 913         if (type >= nr_swapfiles)
 914                 nr_swapfiles = type+1;
 915         p->flags = SWP_USED;
 916         p->swap_file = NULL;
 917         p->swap_device = 0;
 918         p->swap_map = NULL;
 919         p->swap_lockmap = NULL;
 920         p->lowest_bit = 0;
 921         p->highest_bit = 0;
 922         p->max = 1;
 923         error = namei(specialfile,&swap_inode);
 924         if (error)
 925                 goto bad_swap_2;
 926         p->swap_file = swap_inode;
 927         error = -EBUSY;
 928         if (swap_inode->i_count != 1)
 929                 goto bad_swap_2;
 930         error = -EINVAL;
 931 
 932         if (S_ISBLK(swap_inode->i_mode)) {
 933                 p->swap_device = swap_inode->i_rdev;
 934 
 935                 filp.f_inode = swap_inode;
 936                 filp.f_mode = 3; /* read write */
 937                 error = blkdev_open(swap_inode, &filp);
 938                 p->swap_file = NULL;
 939                 iput(swap_inode);
 940                 if(error)
 941                         goto bad_swap_2;
 942                 error = -ENODEV;
 943                 if (!p->swap_device)
 944                         goto bad_swap;
 945                 error = -EBUSY;
 946                 for (i = 0 ; i < nr_swapfiles ; i++) {
 947                         if (i == type)
 948                                 continue;
 949                         if (p->swap_device == swap_info[i].swap_device)
 950                                 goto bad_swap;
 951                 }
 952         } else if (!S_ISREG(swap_inode->i_mode))
 953                 goto bad_swap;
 954         p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
 955         if (!p->swap_lockmap) {
 956                 printk("Unable to start swapping: out of memory :-)\n");
 957                 error = -ENOMEM;
 958                 goto bad_swap;
 959         }
 960         read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
 961         if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) {
 962                 printk("Unable to find swap-space signature\n");
 963                 error = -EINVAL;
 964                 goto bad_swap;
 965         }
 966         memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
 967         j = 0;
 968         p->lowest_bit = 0;
 969         p->highest_bit = 0;
 970         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
 971                 if (test_bit(i,p->swap_lockmap)) {
 972                         if (!p->lowest_bit)
 973                                 p->lowest_bit = i;
 974                         p->highest_bit = i;
 975                         p->max = i+1;
 976                         j++;
 977                 }
 978         }
 979         if (!j) {
 980                 printk("Empty swap-file\n");
 981                 error = -EINVAL;
 982                 goto bad_swap;
 983         }
 984         p->swap_map = (unsigned char *) vmalloc(p->max);
 985         if (!p->swap_map) {
 986                 error = -ENOMEM;
 987                 goto bad_swap;
 988         }
 989         for (i = 1 ; i < p->max ; i++) {
 990                 if (test_bit(i,p->swap_lockmap))
 991                         p->swap_map[i] = 0;
 992                 else
 993                         p->swap_map[i] = 0x80;
 994         }
 995         p->swap_map[0] = 0x80;
 996         memset(p->swap_lockmap,0,PAGE_SIZE);
 997         p->flags = SWP_WRITEOK;
 998         p->pages = j;
 999         nr_swap_pages += j;
1000         printk("Adding Swap: %dk swap-space\n",j<<2);
1001         return 0;
1002 bad_swap:
1003         if(filp.f_op && filp.f_op->release)
1004                 filp.f_op->release(filp.f_inode,&filp);
1005 bad_swap_2:
1006         free_page((long) p->swap_lockmap);
1007         vfree(p->swap_map);
1008         iput(p->swap_file);
1009         p->swap_device = 0;
1010         p->swap_file = NULL;
1011         p->swap_map = NULL;
1012         p->swap_lockmap = NULL;
1013         p->flags = 0;
1014         return error;
1015 }
1016 
1017 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1018 {
1019         unsigned int i, j;
1020 
1021         val->freeswap = val->totalswap = 0;
1022         for (i = 0; i < nr_swapfiles; i++) {
1023                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1024                         continue;
1025                 for (j = 0; j < swap_info[i].max; ++j)
1026                         switch (swap_info[i].swap_map[j]) {
1027                                 case 128:
1028                                         continue;
1029                                 case 0:
1030                                         ++val->freeswap;
1031                                 default:
1032                                         ++val->totalswap;
1033                         }
1034         }
1035         val->freeswap <<= PAGE_SHIFT;
1036         val->totalswap <<= PAGE_SHIFT;
1037         return;
1038 }
1039 
/*
 * set up the free-area data structures:
 *   - mark all pages MAP_PAGE_RESERVED
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * Carves the mem_map array and the per-order buddy bitmaps out of the
 * memory starting at start_mem, and returns the first address past
 * the structures it allocated.
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        int i;

        /*
         * select nr of pages we try to keep free for important stuff
         * with a minimum of 16 pages. This is totally arbitrary
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
        if (i < 16)
                i = 16;
        min_free_pages = i;
        start_mem = init_swap_cache(start_mem, end_mem);
        /*
         * Allocate mem_map[] (one entry per physical page) and mark
         * every page reserved; actually-usable pages are released by
         * the caller later.
         */
        mem_map = (mem_map_t *) start_mem;
        p = mem_map + MAP_NR(end_mem);
        start_mem = (unsigned long) p;
        while (p > mem_map)
                *--p = MAP_PAGE_RESERVED;

        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                /* empty circular free list for this order */
                free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
                mask += mask;   /* doubles the block size: mask for 2^(i+1) pages */
                end_mem = (end_mem + ~mask) & mask;
                /*
                 * one bit per block of this order, rounded up to whole
                 * bytes and then to unsigned-long alignment
                 */
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
                free_area_map[i] = (unsigned char *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */