root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_process
  11. swap_out
  12. try_to_free_page
  13. add_mem_queue
  14. remove_mem_queue
  15. free_pages_ok
  16. check_free_buffers
  17. free_pages
  18. mark_used
  19. __get_free_pages
  20. __get_dma_pages
  21. show_free_areas
  22. try_to_unuse
  23. sys_swapoff
  24. sys_swapon
  25. si_swapinfo
  26. free_area_init

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file should contain most things doing the swapping from/to disk.
   9  * Started 18.12.91
  10  */
  11 
  12 #include <linux/mm.h>
  13 #include <linux/sched.h>
  14 #include <linux/head.h>
  15 #include <linux/kernel.h>
  16 #include <linux/kernel_stat.h>
  17 #include <linux/errno.h>
  18 #include <linux/string.h>
  19 #include <linux/stat.h>
  20 #include <linux/fs.h>
  21 
  22 #include <asm/dma.h>
  23 #include <asm/system.h> /* for cli()/sti() */
  24 #include <asm/bitops.h>
  25 
  26 #define MAX_SWAPFILES 8
  27 
  28 #define SWP_USED        1
  29 #define SWP_WRITEOK     3
  30 
  31 #define SWP_TYPE(entry) (((entry) >> 1) & 0x7f)
  32 #define SWP_OFFSET(entry) ((entry) >> 12)
  33 #define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << 12))
  34 
  35 int min_free_pages = 20;
  36 
  37 static int nr_swapfiles = 0;
  38 static struct wait_queue * lock_queue = NULL;
  39 
  40 static struct swap_info_struct {
  41         unsigned long flags;
  42         struct inode * swap_file;
  43         unsigned int swap_device;
  44         unsigned char * swap_map;
  45         unsigned char * swap_lockmap;
  46         int pages;
  47         int lowest_bit;
  48         int highest_bit;
  49         unsigned long max;
  50 } swap_info[MAX_SWAPFILES];
  51 
  52 extern int shm_swap (int);
  53 
  54 unsigned long *swap_cache;
  55 
  56 #ifdef SWAP_CACHE_INFO
  57 unsigned long swap_cache_add_total = 0;
  58 unsigned long swap_cache_add_success = 0;
  59 unsigned long swap_cache_del_total = 0;
  60 unsigned long swap_cache_del_success = 0;
  61 unsigned long swap_cache_find_total = 0;
  62 unsigned long swap_cache_find_success = 0;
  63 
  64 extern inline void show_swap_cache_info(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  65 {
  66         printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
  67                 swap_cache_add_total, swap_cache_add_success, 
  68                 swap_cache_del_total, swap_cache_del_success,
  69                 swap_cache_find_total, swap_cache_find_success);
  70 }
  71 #endif
  72 
  73 static int add_to_swap_cache(unsigned long addr, unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
  74 {
  75         struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
  76 
  77 #ifdef SWAP_CACHE_INFO
  78         swap_cache_add_total++;
  79 #endif
  80         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  81                 entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
  82                 if (entry)  {
  83                         printk("swap_cache: replacing non-NULL entry\n");
  84                 }
  85 #ifdef SWAP_CACHE_INFO
  86                 swap_cache_add_success++;
  87 #endif
  88                 return 1;
  89         }
  90         return 0;
  91 }
  92 
  93 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  94         unsigned long mem_end)
  95 {
  96         unsigned long swap_cache_size;
  97 
  98         mem_start = (mem_start + 15) & ~15;
  99         swap_cache = (unsigned long *) mem_start;
 100         swap_cache_size = MAP_NR(mem_end);
 101         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
 102         return (unsigned long) (swap_cache + swap_cache_size);
 103 }
 104 
 105 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 106 {
 107         unsigned long type, offset;
 108         struct swap_info_struct * p;
 109 
 110         type = SWP_TYPE(entry);
 111         if (type >= nr_swapfiles) {
 112                 printk("Internal error: bad swap-device\n");
 113                 return;
 114         }
 115         p = &swap_info[type];
 116         offset = SWP_OFFSET(entry);
 117         if (offset >= p->max) {
 118                 printk("rw_swap_page: weirdness\n");
 119                 return;
 120         }
 121         if (p->swap_map && !p->swap_map[offset]) {
 122                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 123                 return;
 124         }
 125         if (!(p->flags & SWP_USED)) {
 126                 printk("Trying to swap to unused swap-device\n");
 127                 return;
 128         }
 129         while (set_bit(offset,p->swap_lockmap))
 130                 sleep_on(&lock_queue);
 131         if (rw == READ)
 132                 kstat.pswpin++;
 133         else
 134                 kstat.pswpout++;
 135         if (p->swap_device) {
 136                 ll_rw_page(rw,p->swap_device,offset,buf);
 137         } else if (p->swap_file) {
 138                 struct inode *swapf = p->swap_file;
 139                 unsigned int zones[8];
 140                 int i;
 141                 if (swapf->i_op->bmap == NULL
 142                         && swapf->i_op->smap != NULL){
 143                         /*
 144                                 With MsDOS, we use msdos_smap which return
 145                                 a sector number (not a cluster or block number).
 146                                 It is a patch to enable the UMSDOS project.
 147                                 Other people are working on better solution.
 148 
 149                                 It sounds like ll_rw_swap_file defined
 150                                 it operation size (sector size) based on
 151                                 PAGE_SIZE and the number of block to read.
 152                                 So using bmap or smap should work even if
 153                                 smap will require more blocks.
 154                         */
 155                         int j;
 156                         unsigned int block = offset << 3;
 157 
 158                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 159                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 160                                         printk("rw_swap_page: bad swap file\n");
 161                                         return;
 162                                 }
 163                         }
 164                 }else{
 165                         int j;
 166                         unsigned int block = offset
 167                                 << (12 - swapf->i_sb->s_blocksize_bits);
 168 
 169                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 170                                 if (!(zones[i] = bmap(swapf,block++))) {
 171                                         printk("rw_swap_page: bad swap file\n");
 172                                         return;
 173                                 }
 174                 }
 175                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 176         } else
 177                 printk("re_swap_page: no swap file or device\n");
 178         if (offset && !clear_bit(offset,p->swap_lockmap))
 179                 printk("rw_swap_page: lock already cleared\n");
 180         wake_up(&lock_queue);
 181 }
 182 
 183 unsigned int get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 184 {
 185         struct swap_info_struct * p;
 186         unsigned int offset, type;
 187 
 188         p = swap_info;
 189         for (type = 0 ; type < nr_swapfiles ; type++,p++) {
 190                 if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
 191                         continue;
 192                 for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 193                         if (p->swap_map[offset])
 194                                 continue;
 195                         p->swap_map[offset] = 1;
 196                         nr_swap_pages--;
 197                         if (offset == p->highest_bit)
 198                                 p->highest_bit--;
 199                         p->lowest_bit = offset;
 200                         return SWP_ENTRY(type,offset);
 201                 }
 202         }
 203         return 0;
 204 }
 205 
 206 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 207 {
 208         struct swap_info_struct * p;
 209         unsigned long offset, type;
 210 
 211         if (!entry)
 212                 return;
 213         offset = SWP_OFFSET(entry);
 214         type = SWP_TYPE(entry);
 215         if (type == SHM_SWP_TYPE)
 216                 return;
 217         if (type >= nr_swapfiles) {
 218                 printk("Trying to duplicate nonexistent swap-page\n");
 219                 return;
 220         }
 221         p = type + swap_info;
 222         if (offset >= p->max) {
 223                 printk("swap_duplicate: weirdness\n");
 224                 return;
 225         }
 226         if (!p->swap_map[offset]) {
 227                 printk("swap_duplicate: trying to duplicate unused page\n");
 228                 return;
 229         }
 230         p->swap_map[offset]++;
 231         return;
 232 }
 233 
/*
 * Drop one reference to a swap entry.  When the use count reaches
 * zero the slot becomes allocatable again and nr_swap_pages grows.
 * The slot is held in the per-area lockmap while the map is touched
 * so we never race with I/O on the same slot.
 */
void swap_free(unsigned long entry)
{
        struct swap_info_struct * p;
        unsigned long offset, type;

        if (!entry)
                return;
        type = SWP_TYPE(entry);
        /* Shared-memory swap entries are managed elsewhere (ipc/shm). */
        if (type == SHM_SWP_TYPE)
                return;
        if (type >= nr_swapfiles) {
                printk("Trying to free nonexistent swap-page\n");
                return;
        }
        p = & swap_info[type];
        offset = SWP_OFFSET(entry);
        if (offset >= p->max) {
                printk("swap_free: weirdness\n");
                return;
        }
        if (!(p->flags & SWP_USED)) {
                printk("Trying to free swap from unused swap-device\n");
                return;
        }
        /* Take the slot lock; sleep until in-flight I/O on it finishes. */
        while (set_bit(offset,p->swap_lockmap))
                sleep_on(&lock_queue);
        /* Re-widen the allocation window used by get_swap_page(). */
        if (offset < p->lowest_bit)
                p->lowest_bit = offset;
        if (offset > p->highest_bit)
                p->highest_bit = offset;
        if (!p->swap_map[offset])
                printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
        else
                if (!--p->swap_map[offset])
                        nr_swap_pages++;
        if (!clear_bit(offset,p->swap_lockmap))
                printk("swap_free: lock already cleared\n");
        wake_up(&lock_queue);
}
 273 
/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
void swap_in(struct vm_area_struct * vma, pte_t * page_table,
        unsigned long entry, int write_access)
{
        unsigned long page = get_free_page(GFP_KERNEL);

        /* get_free_page() may have slept: did someone page it in already? */
        if (pte_val(*page_table) != entry) {
                free_page(page);
                return;
        }
        if (!page) {
                *page_table = BAD_PAGE;
                swap_free(entry);
                oom(current);
                return;
        }
        read_swap_page(entry, (char *) page);
        /* The swap read also sleeps - re-check the pte a second time. */
        if (pte_val(*page_table) != entry) {
                free_page(page);
                return;
        }
        vma->vm_task->mm->rss++;
        vma->vm_task->mm->maj_flt++;
        /* Read fault: keep the swap slot and remember it in the swap
           cache, so a still-clean page can be discarded for free later. */
        if (!write_access && add_to_swap_cache(page, entry)) {
                *page_table = mk_pte(page, vma->vm_page_prot);
                return;
        }
        /* Write fault (or caching failed): the page goes private and
           writable, and our reference on the swap slot is released. */
        *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        swap_free(entry);
        return;
}
 311 
/*
 * Try to evict the page mapped by *page_table (offset is its place
 * within vma, used by per-vma swapout hooks).  Returns 0 if nothing
 * was freed, 1 when a page was released after being written out or
 * re-attached to its swap-cache entry, and 1 + remaining map count
 * when a shared page's mapping was merely dropped.
 */
static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned offset, pte_t * page_table)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page;

        pte = *page_table;
        if (!pte_present(pte))
                return 0;
        page = pte_page(pte);
        if (page >= high_memory)
                return 0;
        if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
                return 0;
        /* Recently referenced (or dirtied while still in the swap
           cache): age the pte and leave the page alone this round. */
        if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
                *page_table = pte_mkold(pte);
                return 0;
        }
        if (pte_dirty(pte)) {
                /* Dirty and shared by several ptes - can't write it
                   out on behalf of just one mapping. */
                if (mem_map[MAP_NR(page)] != 1)
                        return 0;
                if (vma->vm_ops && vma->vm_ops->swapout)
                        vma->vm_ops->swapout(vma, offset, page_table);
                else {
                        if (!(entry = get_swap_page()))
                                return 0;
                        pte_val(*page_table) = entry;
                        invalidate();
                        write_swap_page(entry, (char *) page);
                }
                free_page(page);
                return 1 + mem_map[MAP_NR(page)];
        }
        /* Clean page with a swap-cache entry: point the pte back at
           the swap slot - no I/O needed. */
        if ((entry = find_in_swap_cache(page)))  {
                if (mem_map[MAP_NR(page)] != 1) {
                        *page_table = pte_mkdirty(pte);
                        printk("Aiee.. duplicated cached swap-cache entry\n");
                        return 0;
                }
                pte_val(*page_table) = entry;
                invalidate();
                free_page(page);
                return 1;
        }
        /* Clean and not swap-cached: it can be re-fetched on demand,
           so simply drop the mapping. */
        pte_clear(page_table);
        invalidate();
        free_page(page);
        return 1 + mem_map[MAP_NR(page)];
}
 361 
 362 /*
 363  * A new implementation of swap_out().  We do not swap complete processes,
 364  * but only a small number of blocks, before we continue with the next
 365  * process.  The number of blocks actually swapped is determined on the
 366  * number of page faults, that this process actually had in the last time,
 367  * so we won't swap heavily used processes all the time ...
 368  *
 369  * Note: the priority argument is a hint on much CPU to waste with the
 370  *       swap block search, not a hint, of how much blocks to swap with
 371  *       each process.
 372  *
 373  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 374  */
 375 
 376 /*
 377  * These are the minimum and maximum number of pages to swap from one process,
 378  * before proceeding to the next:
 379  */
 380 #define SWAP_MIN        4
 381 #define SWAP_MAX        32
 382 
 383 /*
 384  * The actual number of pages to swap is determined as:
 385  * SWAP_RATIO / (number of recent major page faults)
 386  */
 387 #define SWAP_RATIO      128
 388 
/*
 * Scan one process' address space, resuming where the previous scan
 * of this process stopped (mm->swap_address), and try to swap out a
 * single page.  Returns 1 when a page was taken (and the resume
 * position saved), 0 when the whole space was covered without luck.
 */
static int swap_out_process(struct task_struct * p)
{
        pgd_t *pgdir;
        unsigned long address;
        unsigned long offset;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->mm->swap_address;
        p->mm->swap_address = 0;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p, address);
        if (!vma)
                return 0;
        if (address < vma->vm_start)
                address = vma->vm_start;

        /* Split the address into a page-directory slot plus the byte
           offset inside the page table that slot points to. */
        pgdir = PAGE_DIR_OFFSET(p, address);
        offset = address & ~PGDIR_MASK;
        address &= PGDIR_MASK;
        for ( ; address < TASK_SIZE ; pgdir++, address = address + PGDIR_SIZE, offset = 0) {
                pte_t *pg_table;

                if (pgd_none(*pgdir))
                        continue;
                if (pgd_bad(*pgdir)) {
                        printk("Bad page directory at address %08lx: %08lx\n", address, pgd_val(*pgdir));
                        pgd_clear(pgdir);
                        continue;
                }
                pg_table = (pte_t *) pgd_page(*pgdir);
                if (mem_map[MAP_NR((unsigned long) pg_table)] & MAP_PAGE_RESERVED)
                        continue;
                pg_table += offset >> PAGE_SHIFT;

                /*
                 * Go through this page table.
                 */
                for( ; offset < ~PGDIR_MASK ; pg_table++, offset += PAGE_SIZE) {
                        /*
                         * Update vma again..
                         */
                        for (;;) {
                                if (address+offset < vma->vm_end)
                                        break;
                                vma = vma->vm_next;
                                if (!vma)
                                        return 0;
                        }

                        switch(try_to_swap_out(vma, offset+address-vma->vm_start, pg_table)) {
                                case 0:
                                        break;

                                case 1:
                                        p->mm->rss--;
                                        /* continue with the following page the next time */
                                        p->mm->swap_address = address + offset + PAGE_SIZE;
                                        return 1;

                                default:
                                        /* shared mapping dropped: adjust rss, keep scanning */
                                        p->mm->rss--;
                                        break;
                        }
                }
        }
        /*
         * Finish work with this process, if we reached the end of the page
         * directory.
         */
        return 0;
}
 466 
 467 static int swap_out(unsigned int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 468 {
 469         static int swap_task;
 470         int loop;
 471         int counter = NR_TASKS * 2 >> priority;
 472         struct task_struct *p;
 473 
 474         counter = NR_TASKS * 2 >> priority;
 475         for(; counter >= 0; counter--, swap_task++) {
 476                 /*
 477                  * Check that swap_task is suitable for swapping.  If not, look for
 478                  * the next suitable process.
 479                  */
 480                 loop = 0;
 481                 while(1) {
 482                         if (swap_task >= NR_TASKS) {
 483                                 swap_task = 1;
 484                                 if (loop)
 485                                         /* all processes are unswappable or already swapped out */
 486                                         return 0;
 487                                 loop = 1;
 488                         }
 489 
 490                         p = task[swap_task];
 491                         if (p && p->mm->swappable && p->mm->rss)
 492                                 break;
 493 
 494                         swap_task++;
 495                 }
 496 
 497                 /*
 498                  * Determine the number of pages to swap from this process.
 499                  */
 500                 if (!p->mm->swap_cnt) {
 501                         p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
 502                         p->mm->old_maj_flt = p->mm->maj_flt;
 503 
 504                         if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
 505                                 p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
 506                                 p->mm->swap_cnt = SWAP_MIN;
 507                         } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
 508                                 p->mm->swap_cnt = SWAP_MAX;
 509                         else
 510                                 p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
 511                 }
 512                 if (swap_out_process(p)) {
 513                         if ((--p->mm->swap_cnt) == 0)
 514                                 swap_task++;
 515                         return 1;
 516                 }
 517         }
 518         return 0;
 519 }
 520 
 521 static int try_to_free_page(int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 522 {
 523         int i=6;
 524 
 525         while (i--) {
 526                 if (priority != GFP_NOBUFFER && shrink_buffers(i))
 527                         return 1;
 528                 if (shm_swap(i))
 529                         return 1;
 530                 if (swap_out(i))
 531                         return 1;
 532         }
 533         return 0;
 534 }
 535 
 536 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 537 {
 538         entry->prev = head;
 539         (entry->next = head->next)->prev = entry;
 540         head->next = entry;
 541 }
 542 
 543 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 544 {
 545         entry->next->prev = entry->prev;
 546         entry->prev->next = entry->next;
 547 }
 548 
 549 /*
 550  * Free_page() adds the page to the free lists. This is optimized for
 551  * fast normal cases (no error jumps taken normally).
 552  *
 553  * The way to optimize jumps for gcc-2.2.2 is to:
 554  *  - select the "normal" case and put it inside the if () { XXX }
 555  *  - no else-statements if you can avoid them
 556  *
 557  * With the above two rules, you get a straight-line execution path
 558  * for the normal case, giving better asm-code.
 559  *
 560  * free_page() may sleep since the page being freed may be a buffer
 561  * page or present in the swap cache. It will not sleep, however,
 562  * for a freshly allocated page (get_free_page()).
 563  */
 564 
 565 /*
 566  * Buddy system. Hairy. You really aren't expected to understand this
 567  */
/*
 * Return a block of 2^order pages at addr to the buddy free lists.
 * While the block's buddy is itself free (tracked by the
 * free_area_map bitmaps), remove the buddy and merge the pair into
 * a block of the next higher order.  Caller runs with interrupts
 * disabled.
 */
static inline void free_pages_ok(unsigned long addr, unsigned long order)
{
        unsigned long index = MAP_NR(addr) >> (1 + order);
        unsigned long mask = PAGE_MASK << order;

        addr &= mask;
        nr_free_pages += 1 << order;
        while (order < NR_MEM_LISTS-1) {
                /* change_bit() yields the previous bit value: 0 means
                   the buddy is in use, so no more merging is possible. */
                if (!change_bit(index, free_area_map[order]))
                        break;
                /* addr ^ (1+~mask) == addr ^ -mask: the buddy's address. */
                remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
                order++;
                index >>= 1;
                mask <<= 1;
                addr &= mask;
        }
        add_mem_queue(free_area_list+order, (struct mem_list *) addr);
}
 586 
 587 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 588 {
 589         struct buffer_head * bh;
 590 
 591         bh = buffer_pages[MAP_NR(addr)];
 592         if (bh) {
 593                 struct buffer_head *tmp = bh;
 594                 do {
 595                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 596                                 refile_buffer(tmp);
 597                         tmp = tmp->b_this_page;
 598                 } while (tmp != bh);
 599         }
 600 }
 601 
/*
 * Drop one reference to the 2^order page block at addr.  The last
 * reference puts the block back on the buddy lists and purges any
 * swap-cache entry.  Reserved pages and addresses beyond physical
 * memory are silently left alone; freeing an already-free page is
 * loudly reported.
 */
void free_pages(unsigned long addr, unsigned long order)
{
        if (addr < high_memory) {
                unsigned long flag;
                mem_map_t * map = mem_map + MAP_NR(addr);
                if (*map) {
                        if (!(*map & MAP_PAGE_RESERVED)) {
                                save_flags(flag);
                                cli();
                                if (!--*map)  {
                                        free_pages_ok(addr, order);
                                        delete_from_swap_cache(addr);
                                }
                                restore_flags(flag);
                                /* Down to one user: buffers on the page may
                                   now be refiled for reclaim. */
                                if (*map == 1)
                                        check_free_buffers(addr);
                        }
                        return;
                }
                printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
                printk("PC = %p\n", __builtin_return_address(0));
                return;
        }
}
 626 
/*
 * Some ugly macros to speed up __get_free_pages()..
 *
 * RMQUEUE scans the free lists from "order" upwards.  The first
 * non-empty list supplies a block which is unlinked, marked used,
 * and split down to the requested size by EXPAND.  On success it
 * RETURNS from the enclosing function with the block's address;
 * falling out of the macro means no memory was available.  The
 * caller holds cli(); restore_flags() re-enables interrupts before
 * the (possibly slow) EXPAND work.
 */
#define RMQUEUE(order) \
do { struct mem_list * queue = free_area_list+order; \
     unsigned long new_order = order; \
	do { struct mem_list *next = queue->next; \
		if (queue != next) { \
			(queue->next = next->next)->prev = queue; \
			mark_used((unsigned long) next, new_order); \
			nr_free_pages -= 1 << order; \
			restore_flags(flags); \
			EXPAND(next, order, new_order); \
			return (unsigned long) next; \
		} new_order++; queue++; \
	} while (new_order < NR_MEM_LISTS); \
} while (0)
 644 
/*
 * Toggle the buddy-tracking bit for the block at addr of the given
 * order; returns the bit's previous value.
 */
static inline int mark_used(unsigned long addr, unsigned long order)
{
        return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
}
 649 
/*
 * EXPAND splits a free block of order "high" down to order "low":
 * each halving step puts the upper half back on the free lists
 * (under cli(), released again by restore_flags) and keeps the
 * lower half; the final piece gets a use count of 1.
 */
#define EXPAND(addr,low,high) \
do { unsigned long size = PAGE_SIZE << high; \
	while (high > low) { \
		high--; size >>= 1; cli(); \
		add_mem_queue(free_area_list+high, addr); \
		mark_used((unsigned long) addr, high); \
		restore_flags(flags); \
		addr = (struct mem_list *) (size + (unsigned long) addr); \
	} mem_map[MAP_NR((unsigned long) addr)] = 1; \
} while (0)
 660 
/*
 * Allocate 2^order contiguous pages.  GFP_ATOMIC requests may dig
 * into the reserved pool and never sleep; other priorities retry
 * after trying to free memory.  Returns the block's address or 0.
 * NOTE: RMQUEUE() returns from this function on success, so the
 * "return 0" just after it is reached only when the lists are empty.
 */
unsigned long __get_free_pages(int priority, unsigned long order)
{
        unsigned long flags;
        int reserved_pages;

        /* Sleeping in interrupt context is not allowed: complain the
           first few times and degrade to an atomic allocation. */
        if (intr_count && priority != GFP_ATOMIC) {
                static int count = 0;
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                        priority = GFP_ATOMIC;
                }
        }
        /* GFP_NFS is allowed to eat deeper into the reserve. */
        reserved_pages = 5;
        if (priority != GFP_NFS)
                reserved_pages = min_free_pages;
        save_flags(flags);
repeat:
        cli();
        if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
                RMQUEUE(order);
                restore_flags(flags);
                return 0;
        }
        restore_flags(flags);
        /* GFP_BUFFER must not recurse into the page-freeing paths. */
        if (priority != GFP_BUFFER && try_to_free_page(priority))
                goto repeat;
        return 0;
}
 690 
 691 /*
 692  * Yes, I know this is ugly. Don't tell me.
 693  */
 694 unsigned long __get_dma_pages(int priority, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 695 {
 696         unsigned long list = 0;
 697         unsigned long result;
 698         unsigned long limit = MAX_DMA_ADDRESS;
 699 
 700         /* if (EISA_bus) limit = ~0UL; */
 701         if (priority != GFP_ATOMIC)
 702                 priority = GFP_BUFFER;
 703         for (;;) {
 704                 result = __get_free_pages(priority, order);
 705                 if (result < limit) /* covers failure as well */
 706                         break;
 707                 *(unsigned long *) result = list;
 708                 list = result;
 709         }
 710         while (list) {
 711                 unsigned long tmp = list;
 712                 list = *(unsigned long *) list;
 713                 free_pages(tmp, order);
 714         }
 715         return result;
 716 }
 717 
 718 /*
 719  * Show free area list (used inside shift_scroll-lock stuff)
 720  * We also calculate the percentage fragmentation. We do this by counting the
 721  * memory on each free list with the exception of the first item on the list.
 722  */
 723 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 724 {
 725         unsigned long order, flags;
 726         unsigned long total = 0;
 727 
 728         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 729         save_flags(flags);
 730         cli();
 731         for (order=0 ; order < NR_MEM_LISTS; order++) {
 732                 struct mem_list * tmp;
 733                 unsigned long nr = 0;
 734                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 735                         nr ++;
 736                 }
 737                 total += nr * (4 << order);
 738                 printk("%lu*%ukB ", nr, 4 << order);
 739         }
 740         restore_flags(flags);
 741         printk("= %lukB)\n", total);
 742 #ifdef SWAP_CACHE_INFO
 743         show_swap_cache_info();
 744 #endif  
 745 }
 746 
 747 /*
 748  * Trying to stop swapping from a file is fraught with races, so
 749  * we repeat quite a bit here when we have to pause. swapoff()
 750  * isn't exactly timing-critical, so who cares?
 751  */
/*
 * Walk every task's page tables and bring back into core every page
 * that lives on swap area 'type', so the area can be deactivated.
 * Returns 0 on success, -ENOMEM if a page for swap-in could not be
 * allocated.
 */
static int try_to_unuse(unsigned int type)
{
        int nr;
        unsigned long tmp = 0;  /* pre-allocated page for the next swap-in; 0 = none held */
        struct task_struct *p;

        nr = 0;
/*
 * When we have to sleep, we restart the whole algorithm from the same
 * task we stopped in. That at least rids us of all races.
 */
repeat:
        for (; nr < NR_TASKS ; nr++) {
                pgd_t * page_dir;
                int i;

                p = task[nr];
                if (!p)
                        continue;
                page_dir = PAGE_DIR_OFFSET(p, 0);
                for (i = 0 ; i < PTRS_PER_PAGE ; page_dir++, i++) {
                        int j;
                        pte_t *page_table;

                        if (pgd_none(*page_dir))
                                continue;
                        if (pgd_bad(*page_dir)) {
                                printk("bad page directory entry [%d] %08lx\n", i, pgd_val(*page_dir));
                                pgd_clear(page_dir);
                                continue;
                        }
                        page_table = (pte_t *) pgd_page(*page_dir);
                        /* reserved page tables (e.g. kernel mappings) are skipped */
                        if (mem_map[MAP_NR((unsigned long) page_table)] & MAP_PAGE_RESERVED)
                                continue;
                        for (j = 0 ; j < PTRS_PER_PAGE ; page_table++, j++) {
                                pte_t pte;
                                pte = *page_table;
                                if (pte_none(pte))
                                        continue;
                                if (pte_present(pte)) {
                                        /* Page is in core but may still have a copy
                                           cached on this swap area: drop the cache
                                           entry and mark the pte dirty so the page
                                           gets written out afresh if evicted. */
                                        unsigned long page = pte_page(pte);
                                        if (page >= high_memory)
                                                continue;
                                        if (!in_swap_cache(page))
                                                continue;
                                        if (SWP_TYPE(in_swap_cache(page)) != type)
                                                continue;
                                        delete_from_swap_cache(page);
                                        *page_table = pte_mkdirty(pte);
                                        continue;
                                }
                                /* Non-present pte: its value is a swap entry. */
                                if (SWP_TYPE(pte_val(pte)) != type)
                                        continue;
                                if (!tmp) {
                                        /* GFP_KERNEL allocation may sleep, so restart
                                           the scan of this task from the top. */
                                        if (!(tmp = __get_free_page(GFP_KERNEL)))
                                                return -ENOMEM;
                                        goto repeat;
                                }
                                read_swap_page(pte_val(pte), (char *) tmp);
                                /* read_swap_page slept: if someone touched the pte
                                   meanwhile, throw the result away and rescan. */
                                if (pte_val(*page_table) != pte_val(pte))
                                        goto repeat;
                                *page_table = pte_mkwrite(pte_mkdirty(mk_pte(tmp, PAGE_COPY)));
                                ++p->mm->rss;
                                swap_free(pte_val(pte));
                                tmp = 0;        /* page consumed; allocate a new one next time */
                        }
                }
        }
        /* release any left-over pre-allocated page (tmp may be 0 here;
           presumably free_page() tolerates that -- TODO confirm) */
        free_page(tmp);
        return 0;
}
 823 
/*
 * The swapoff system call: stop swapping to the named file or block
 * device.  Requires superuser.  All pages currently on the area are
 * swapped back into core via try_to_unuse() before the area's
 * bookkeeping is torn down.
 */
asmlinkage int sys_swapoff(const char * specialfile)
{
        struct swap_info_struct * p;
        struct inode * inode;
        unsigned int type;
        struct file filp;
        int i;

        if (!suser())
                return -EPERM;
        i = namei(specialfile,&inode);
        if (i)
                return i;
        /* Locate the active swap area that matches this inode (swap
           file) or device number (swap partition). */
        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++) {
                if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                if (p->swap_file) {
                        if (p->swap_file == inode)
                                break;
                } else {
                        if (!S_ISBLK(inode->i_mode))
                                continue;
                        if (p->swap_device == inode->i_rdev)
                                break;
                }
        }

        if (type >= nr_swapfiles){
                iput(inode);
                return -EINVAL;
        }
        /* Drop to SWP_USED: no new pages may be swapped out to this
           area while we empty it. */
        p->flags = SWP_USED;
        i = try_to_unuse(type);
        if (i) {
                iput(inode);
                /* Could not empty the area -- make it writable again. */
                p->flags = SWP_WRITEOK;
                return i;
        }

        if(p->swap_device){
                memset(&filp, 0, sizeof(filp));
                filp.f_inode = inode;
                filp.f_mode = 3; /* read write */
                /* open it again to get fops */
                if( !blkdev_open(inode, &filp) &&
                   filp.f_op && filp.f_op->release){
                        /* NOTE(review): release is invoked twice -- apparently
                           once to balance the blkdev_open() just above and once
                           to balance the open done by sys_swapon(); confirm
                           before changing. */
                        filp.f_op->release(inode,&filp);
                        filp.f_op->release(inode,&filp);
                }
        }
        iput(inode);

        /* Tear down all bookkeeping for this swap area. */
        nr_swap_pages -= p->pages;
        iput(p->swap_file);
        p->swap_file = NULL;
        p->swap_device = 0;
        vfree(p->swap_map);
        p->swap_map = NULL;
        free_page((long) p->swap_lockmap);
        p->swap_lockmap = NULL;
        p->flags = 0;
        return 0;
}
 888 
 889 /*
 890  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 891  *
 892  * The swapon system call
 893  */
 894 asmlinkage int sys_swapon(const char * specialfile)
     /* [previous][next][first][last][top][bottom][index][help] */
 895 {
 896         struct swap_info_struct * p;
 897         struct inode * swap_inode;
 898         unsigned int type;
 899         int i,j;
 900         int error;
 901         struct file filp;
 902 
 903         memset(&filp, 0, sizeof(filp));
 904         if (!suser())
 905                 return -EPERM;
 906         p = swap_info;
 907         for (type = 0 ; type < nr_swapfiles ; type++,p++)
 908                 if (!(p->flags & SWP_USED))
 909                         break;
 910         if (type >= MAX_SWAPFILES)
 911                 return -EPERM;
 912         if (type >= nr_swapfiles)
 913                 nr_swapfiles = type+1;
 914         p->flags = SWP_USED;
 915         p->swap_file = NULL;
 916         p->swap_device = 0;
 917         p->swap_map = NULL;
 918         p->swap_lockmap = NULL;
 919         p->lowest_bit = 0;
 920         p->highest_bit = 0;
 921         p->max = 1;
 922         error = namei(specialfile,&swap_inode);
 923         if (error)
 924                 goto bad_swap_2;
 925         p->swap_file = swap_inode;
 926         error = -EBUSY;
 927         if (swap_inode->i_count != 1)
 928                 goto bad_swap_2;
 929         error = -EINVAL;
 930 
 931         if (S_ISBLK(swap_inode->i_mode)) {
 932                 p->swap_device = swap_inode->i_rdev;
 933 
 934                 filp.f_inode = swap_inode;
 935                 filp.f_mode = 3; /* read write */
 936                 error = blkdev_open(swap_inode, &filp);
 937                 p->swap_file = NULL;
 938                 iput(swap_inode);
 939                 if(error)
 940                         goto bad_swap_2;
 941                 error = -ENODEV;
 942                 if (!p->swap_device)
 943                         goto bad_swap;
 944                 error = -EBUSY;
 945                 for (i = 0 ; i < nr_swapfiles ; i++) {
 946                         if (i == type)
 947                                 continue;
 948                         if (p->swap_device == swap_info[i].swap_device)
 949                                 goto bad_swap;
 950                 }
 951         } else if (!S_ISREG(swap_inode->i_mode))
 952                 goto bad_swap;
 953         p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
 954         if (!p->swap_lockmap) {
 955                 printk("Unable to start swapping: out of memory :-)\n");
 956                 error = -ENOMEM;
 957                 goto bad_swap;
 958         }
 959         read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
 960         if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) {
 961                 printk("Unable to find swap-space signature\n");
 962                 error = -EINVAL;
 963                 goto bad_swap;
 964         }
 965         memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
 966         j = 0;
 967         p->lowest_bit = 0;
 968         p->highest_bit = 0;
 969         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
 970                 if (test_bit(i,p->swap_lockmap)) {
 971                         if (!p->lowest_bit)
 972                                 p->lowest_bit = i;
 973                         p->highest_bit = i;
 974                         p->max = i+1;
 975                         j++;
 976                 }
 977         }
 978         if (!j) {
 979                 printk("Empty swap-file\n");
 980                 error = -EINVAL;
 981                 goto bad_swap;
 982         }
 983         p->swap_map = (unsigned char *) vmalloc(p->max);
 984         if (!p->swap_map) {
 985                 error = -ENOMEM;
 986                 goto bad_swap;
 987         }
 988         for (i = 1 ; i < p->max ; i++) {
 989                 if (test_bit(i,p->swap_lockmap))
 990                         p->swap_map[i] = 0;
 991                 else
 992                         p->swap_map[i] = 0x80;
 993         }
 994         p->swap_map[0] = 0x80;
 995         memset(p->swap_lockmap,0,PAGE_SIZE);
 996         p->flags = SWP_WRITEOK;
 997         p->pages = j;
 998         nr_swap_pages += j;
 999         printk("Adding Swap: %dk swap-space\n",j<<2);
1000         return 0;
1001 bad_swap:
1002         if(filp.f_op && filp.f_op->release)
1003                 filp.f_op->release(filp.f_inode,&filp);
1004 bad_swap_2:
1005         free_page((long) p->swap_lockmap);
1006         vfree(p->swap_map);
1007         iput(p->swap_file);
1008         p->swap_device = 0;
1009         p->swap_file = NULL;
1010         p->swap_map = NULL;
1011         p->swap_lockmap = NULL;
1012         p->flags = 0;
1013         return error;
1014 }
1015 
1016 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1017 {
1018         unsigned int i, j;
1019 
1020         val->freeswap = val->totalswap = 0;
1021         for (i = 0; i < nr_swapfiles; i++) {
1022                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1023                         continue;
1024                 for (j = 0; j < swap_info[i].max; ++j)
1025                         switch (swap_info[i].swap_map[j]) {
1026                                 case 128:
1027                                         continue;
1028                                 case 0:
1029                                         ++val->freeswap;
1030                                 default:
1031                                         ++val->totalswap;
1032                         }
1033         }
1034         val->freeswap <<= PAGE_SHIFT;
1035         val->totalswap <<= PAGE_SHIFT;
1036         return;
1037 }
1038 
1039 /*
1040  * set up the free-area data structures:
1041  *   - mark all pages MAP_PAGE_RESERVED
1042  *   - mark all memory queues empty
1043  *   - clear the memory bitmaps
1044  */
/*
 * Boot-time initialisation of the free-area (buddy) allocator.
 * Carves the mem_map array and the per-order free bitmaps out of the
 * memory starting at start_mem, and returns the new start_mem just
 * past everything it allocated.
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        int i;

        /*
         * select nr of pages we try to keep free for important stuff
         * with a minimum of 16 pages. This is totally arbitrary
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
        if (i < 16)
                i = 16;
        min_free_pages = i;
        start_mem = init_swap_cache(start_mem, end_mem);
        /* mem_map gets one entry per physical page; everything starts
           out MAP_PAGE_RESERVED and is released later by free_pages. */
        mem_map = (mem_map_t *) start_mem;
        p = mem_map + MAP_NR(end_mem);
        start_mem = (unsigned long) p;
        while (p > mem_map)
                *--p = MAP_PAGE_RESERVED;

        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                /* empty circular list: head points at itself */
                free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
                /* double the alignment each order: mask selects blocks
                   of 2^i pages */
                mask += mask;
                end_mem = (end_mem + ~mask) & mask;
                /* one bit per 2^i-page block, rounded up to whole
                   bytes and then to unsigned-long alignment */
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
                free_area_map[i] = (unsigned char *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */