root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_pmd
  11. swap_out_pgd
  12. swap_out_vma
  13. swap_out_process
  14. swap_out
  15. try_to_free_page
  16. add_mem_queue
  17. remove_mem_queue
  18. free_pages_ok
  19. check_free_buffers
  20. free_pages
  21. mark_used
  22. __get_free_pages
  23. __get_dma_pages
  24. show_free_areas
  25. unuse_pte
  26. unuse_pmd
  27. unuse_pgd
  28. unuse_vma
  29. unuse_process
  30. try_to_unuse
  31. sys_swapoff
  32. sys_swapon
  33. si_swapinfo
  34. free_area_init

   1 #define THREE_LEVEL
   2 /*
   3  *  linux/mm/swap.c
   4  *
   5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6  */
   7 
   8 /*
   9  * This file should contain most things doing the swapping from/to disk.
  10  * Started 18.12.91
  11  */
  12 
  13 #include <linux/mm.h>
  14 #include <linux/sched.h>
  15 #include <linux/head.h>
  16 #include <linux/kernel.h>
  17 #include <linux/kernel_stat.h>
  18 #include <linux/errno.h>
  19 #include <linux/string.h>
  20 #include <linux/stat.h>
  21 #include <linux/fs.h>
  22 
  23 #include <asm/dma.h>
  24 #include <asm/system.h> /* for cli()/sti() */
  25 #include <asm/bitops.h>
  26 #include <asm/pgtable.h>
  27 
  28 #define MAX_SWAPFILES 8
  29 
  30 #define SWP_USED        1
  31 #define SWP_WRITEOK     3
  32 
  33 #define SWP_TYPE(entry) (((entry) >> 1) & 0x7f)
  34 #define SWP_OFFSET(entry) ((entry) >> 12)
  35 #define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << 12))
  36 
  37 int min_free_pages = 20;
  38 
  39 static int nr_swapfiles = 0;
  40 static struct wait_queue * lock_queue = NULL;
  41 
  42 static struct swap_info_struct {
  43         unsigned long flags;
  44         struct inode * swap_file;
  45         unsigned int swap_device;
  46         unsigned char * swap_map;
  47         unsigned char * swap_lockmap;
  48         int pages;
  49         int lowest_bit;
  50         int highest_bit;
  51         unsigned long max;
  52 } swap_info[MAX_SWAPFILES];
  53 
  54 extern int shm_swap (int);
  55 
  56 unsigned long *swap_cache;
  57 
  58 #ifdef SWAP_CACHE_INFO
  59 unsigned long swap_cache_add_total = 0;
  60 unsigned long swap_cache_add_success = 0;
  61 unsigned long swap_cache_del_total = 0;
  62 unsigned long swap_cache_del_success = 0;
  63 unsigned long swap_cache_find_total = 0;
  64 unsigned long swap_cache_find_success = 0;
  65 
/* Dump the swap-cache attempt/success counters (compiled in only when
   SWAP_CACHE_INFO is defined; called from show_free_areas()). */
extern inline void show_swap_cache_info(void)
{
        printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
                swap_cache_add_total, swap_cache_add_success, 
                swap_cache_del_total, swap_cache_del_success,
                swap_cache_find_total, swap_cache_find_success);
}
  73 #endif
  74 
  75 static int add_to_swap_cache(unsigned long addr, unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
  76 {
  77         struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
  78 
  79 #ifdef SWAP_CACHE_INFO
  80         swap_cache_add_total++;
  81 #endif
  82         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  83                 entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
  84                 if (entry)  {
  85                         printk("swap_cache: replacing non-NULL entry\n");
  86                 }
  87 #ifdef SWAP_CACHE_INFO
  88                 swap_cache_add_success++;
  89 #endif
  90                 return 1;
  91         }
  92         return 0;
  93 }
  94 
  95 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  96         unsigned long mem_end)
  97 {
  98         unsigned long swap_cache_size;
  99 
 100         mem_start = (mem_start + 15) & ~15;
 101         swap_cache = (unsigned long *) mem_start;
 102         swap_cache_size = MAP_NR(mem_end);
 103         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
 104         return (unsigned long) (swap_cache + swap_cache_size);
 105 }
 106 
 107 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 108 {
 109         unsigned long type, offset;
 110         struct swap_info_struct * p;
 111 
 112         type = SWP_TYPE(entry);
 113         if (type >= nr_swapfiles) {
 114                 printk("Internal error: bad swap-device\n");
 115                 return;
 116         }
 117         p = &swap_info[type];
 118         offset = SWP_OFFSET(entry);
 119         if (offset >= p->max) {
 120                 printk("rw_swap_page: weirdness\n");
 121                 return;
 122         }
 123         if (p->swap_map && !p->swap_map[offset]) {
 124                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 125                 return;
 126         }
 127         if (!(p->flags & SWP_USED)) {
 128                 printk("Trying to swap to unused swap-device\n");
 129                 return;
 130         }
 131         while (set_bit(offset,p->swap_lockmap))
 132                 sleep_on(&lock_queue);
 133         if (rw == READ)
 134                 kstat.pswpin++;
 135         else
 136                 kstat.pswpout++;
 137         if (p->swap_device) {
 138                 ll_rw_page(rw,p->swap_device,offset,buf);
 139         } else if (p->swap_file) {
 140                 struct inode *swapf = p->swap_file;
 141                 unsigned int zones[8];
 142                 int i;
 143                 if (swapf->i_op->bmap == NULL
 144                         && swapf->i_op->smap != NULL){
 145                         /*
 146                                 With MsDOS, we use msdos_smap which return
 147                                 a sector number (not a cluster or block number).
 148                                 It is a patch to enable the UMSDOS project.
 149                                 Other people are working on better solution.
 150 
 151                                 It sounds like ll_rw_swap_file defined
 152                                 it operation size (sector size) based on
 153                                 PAGE_SIZE and the number of block to read.
 154                                 So using bmap or smap should work even if
 155                                 smap will require more blocks.
 156                         */
 157                         int j;
 158                         unsigned int block = offset << 3;
 159 
 160                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 161                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 162                                         printk("rw_swap_page: bad swap file\n");
 163                                         return;
 164                                 }
 165                         }
 166                 }else{
 167                         int j;
 168                         unsigned int block = offset
 169                                 << (12 - swapf->i_sb->s_blocksize_bits);
 170 
 171                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 172                                 if (!(zones[i] = bmap(swapf,block++))) {
 173                                         printk("rw_swap_page: bad swap file\n");
 174                                         return;
 175                                 }
 176                 }
 177                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 178         } else
 179                 printk("re_swap_page: no swap file or device\n");
 180         if (offset && !clear_bit(offset,p->swap_lockmap))
 181                 printk("rw_swap_page: lock already cleared\n");
 182         wake_up(&lock_queue);
 183 }
 184 
 185 unsigned int get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 186 {
 187         struct swap_info_struct * p;
 188         unsigned int offset, type;
 189 
 190         p = swap_info;
 191         for (type = 0 ; type < nr_swapfiles ; type++,p++) {
 192                 if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
 193                         continue;
 194                 for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 195                         if (p->swap_map[offset])
 196                                 continue;
 197                         if (test_bit(offset, p->swap_lockmap))
 198                                 continue;
 199                         p->swap_map[offset] = 1;
 200                         nr_swap_pages--;
 201                         if (offset == p->highest_bit)
 202                                 p->highest_bit--;
 203                         p->lowest_bit = offset;
 204                         return SWP_ENTRY(type,offset);
 205                 }
 206         }
 207         return 0;
 208 }
 209 
 210 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 211 {
 212         struct swap_info_struct * p;
 213         unsigned long offset, type;
 214 
 215         if (!entry)
 216                 return;
 217         offset = SWP_OFFSET(entry);
 218         type = SWP_TYPE(entry);
 219         if (type == SHM_SWP_TYPE)
 220                 return;
 221         if (type >= nr_swapfiles) {
 222                 printk("Trying to duplicate nonexistent swap-page\n");
 223                 return;
 224         }
 225         p = type + swap_info;
 226         if (offset >= p->max) {
 227                 printk("swap_duplicate: weirdness\n");
 228                 return;
 229         }
 230         if (!p->swap_map[offset]) {
 231                 printk("swap_duplicate: trying to duplicate unused page\n");
 232                 return;
 233         }
 234         p->swap_map[offset]++;
 235         return;
 236 }
 237 
 238 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 239 {
 240         struct swap_info_struct * p;
 241         unsigned long offset, type;
 242 
 243         if (!entry)
 244                 return;
 245         type = SWP_TYPE(entry);
 246         if (type == SHM_SWP_TYPE)
 247                 return;
 248         if (type >= nr_swapfiles) {
 249                 printk("Trying to free nonexistent swap-page\n");
 250                 return;
 251         }
 252         p = & swap_info[type];
 253         offset = SWP_OFFSET(entry);
 254         if (offset >= p->max) {
 255                 printk("swap_free: weirdness\n");
 256                 return;
 257         }
 258         if (!(p->flags & SWP_USED)) {
 259                 printk("Trying to free swap from unused swap-device\n");
 260                 return;
 261         }
 262         if (offset < p->lowest_bit)
 263                 p->lowest_bit = offset;
 264         if (offset > p->highest_bit)
 265                 p->highest_bit = offset;
 266         if (!p->swap_map[offset])
 267                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 268         else
 269                 if (!--p->swap_map[offset])
 270                         nr_swap_pages++;
 271 }
 272 
 273 /*
 274  * The tests may look silly, but it essentially makes sure that
 275  * no other process did a swap-in on us just as we were waiting.
 276  *
 277  * Also, don't bother to add to the swap cache if this page-in
 278  * was due to a write access.
 279  */
void swap_in(struct vm_area_struct * vma, pte_t * page_table,
        unsigned long entry, int write_access)
{
        unsigned long page = get_free_page(GFP_KERNEL);

        /* get_free_page() may have slept: someone else may already have
           swapped this page in while we waited. */
        if (pte_val(*page_table) != entry) {
                free_page(page);
                return;
        }
        if (!page) {
                /* No memory at all: map a bad page and kill the process. */
                *page_table = BAD_PAGE;
                swap_free(entry);
                oom(current);
                return;
        }
        read_swap_page(entry, (char *) page);
        /* The swap read slept too - re-check the pte once more. */
        if (pte_val(*page_table) != entry) {
                free_page(page);
                return;
        }
        vma->vm_task->mm->rss++;
        vma->vm_task->mm->maj_flt++;
        /* Read fault: keep the page clean and remember the swap copy in
           the cache, so a later swap-out needs no I/O. */
        if (!write_access && add_to_swap_cache(page, entry)) {
                *page_table = mk_pte(page, vma->vm_page_prot);
                return;
        }
        /* Write fault (or cache insert failed): map writable+dirty and
           release the now-stale swap slot. */
        *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        swap_free(entry);
        return;
}
 310 
 311 /*
  312  * The swap-out functions return 1 if they successfully
 313  * threw something out, and we got a free page. It returns
 314  * zero if it couldn't do anything, and any other value
 315  * indicates it decreased rss, but the page was shared.
 316  *
 317  * NOTE! If it sleeps, it *must* return 1 to make sure we
 318  * don't continue with the swap-out. Otherwise we may be
 319  * using a process that no longer actually exists (it might
 320  * have died while we slept).
 321  */
static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page;

        pte = *page_table;
        if (!pte_present(pte))
                return 0;
        page = pte_page(pte);
        if (page >= high_memory)
                return 0;
        /* Reserved pages are never swapped out. */
        if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
                return 0;
        /* Recently-accessed page: just age it (clear the young bit) and
           keep it.  A dirty page's stale swap-cache entry is dropped
           first, since the on-disk copy no longer matches. */
        if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
                *page_table = pte_mkold(pte);
                return 0;
        }       
        if (pte_dirty(pte)) {
                /* Dirty and shared: can't write it out safely from here. */
                if (mem_map[MAP_NR(page)] != 1)
                        return 0;
                if (vma->vm_ops && vma->vm_ops->swapout) {
                        /* The vma (e.g. shared mem) has its own swapout op. */
                        vma->vm_task->mm->rss--;
                        vma->vm_ops->swapout(vma, address-vma->vm_start, page_table);
                } else {
                        if (!(entry = get_swap_page()))
                                return 0;
                        vma->vm_task->mm->rss--;
                        pte_val(*page_table) = entry;
                        invalidate();
                        write_swap_page(entry, (char *) page);
                }
                free_page(page);
                return 1;       /* we slept: the process may not exist any more */
        }
        /* Clean page with a valid copy already on swap: re-point the pte
           at the swap entry and drop the page without any I/O. */
        if ((entry = find_in_swap_cache(page)))  {
                if (mem_map[MAP_NR(page)] != 1) {
                        *page_table = pte_mkdirty(pte);
                        printk("Aiee.. duplicated cached swap-cache entry\n");
                        return 0;
                }
                vma->vm_task->mm->rss--;
                pte_val(*page_table) = entry;
                invalidate();
                free_page(page);
                return 1;
        } 
        /* Clean and not cached: just unmap it.  Return the map count as
           sampled before free_page(): 1 means we freed the last user,
           >1 means the page was shared (rss decreased, nothing freed). */
        vma->vm_task->mm->rss--;
        pte_clear(page_table);
        invalidate();
        entry = mem_map[MAP_NR(page)];
        free_page(page);
        return entry;
}
 376 
 377 /*
 378  * A new implementation of swap_out().  We do not swap complete processes,
 379  * but only a small number of blocks, before we continue with the next
 380  * process.  The number of blocks actually swapped is determined on the
 381  * number of page faults, that this process actually had in the last time,
 382  * so we won't swap heavily used processes all the time ...
 383  *
  384  * Note: the priority argument is a hint on how much CPU to waste with the
  385  *       swap block search, not a hint of how many blocks to swap with
  386  *       each process.
 387  *
 388  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 389  */
 390 
 391 /*
 392  * These are the minimum and maximum number of pages to swap from one process,
 393  * before proceeding to the next:
 394  */
 395 #define SWAP_MIN        4
 396 #define SWAP_MAX        32
 397 
 398 /*
 399  * The actual number of pages to swap is determined as:
 400  * SWAP_RATIO / (number of recent major page faults)
 401  */
 402 #define SWAP_RATIO      128
 403 
/* Walk the ptes under one pmd entry, trying to swap one page out.
   Returns the first nonzero try_to_swap_out() result, else 0. */
static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
        unsigned long address, unsigned long end)
{
        pte_t * pte;
        unsigned long pmd_end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }
        
        pte = pte_offset(dir, address);
        
        /* Clamp the scan to this pmd's range. */
        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                int result;
                /* Record the resume point BEFORE trying: try_to_swap_out()
                   may sleep, and the next scan restarts from swap_address. */
                vma->vm_task->mm->swap_address = address + PAGE_SIZE;
                result = try_to_swap_out(vma, address, pte);
                if (result)
                        return result;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}
 435 
/* Walk the pmds under one pgd entry, trying to swap one page out.
   Returns the first nonzero swap_out_pmd() result, else 0. */
static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
        unsigned long address, unsigned long end)
{
        pmd_t * pmd;
        unsigned long pgd_end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }

        pmd = pmd_offset(dir, address);

        /* Clamp the scan to this pgd's range. */
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
        if (end > pgd_end)
                end = pgd_end;
        
        do {
                int result = swap_out_pmd(vma, pmd, address, end);
                if (result)
                        return result;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
 465 
 466 static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 467         unsigned long start)
 468 {
 469         unsigned long end;
 470 
 471         /* Don't swap out areas like shared memory which have their
 472             own separate swapping mechanism. */
 473         if (vma->vm_flags & VM_SHM)
 474                 return 0;
 475 
 476         end = vma->vm_end;
 477         while (start < end) {
 478                 int result = swap_out_pgd(vma, pgdir, start, end);
 479                 if (result)
 480                         return result;
 481                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 482                 pgdir++;
 483         }
 484         return 0;
 485 }
 486 
/*
 * Try to swap one page out of process 'p', resuming where the previous
 * scan of this process stopped (mm->swap_address).  Returns the first
 * nonzero try_to_swap_out() result, or 0 when the whole address space
 * has been scanned without success.
 */
static int swap_out_process(struct task_struct * p)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->mm->swap_address;
        p->mm->swap_address = 0;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p, address);
        if (!vma)
                return 0;
        if (address < vma->vm_start)
                address = vma->vm_start;

        for (;;) {
                int result = swap_out_vma(vma, pgd_offset(p, address), address);
                if (result)
                        return result;
                vma = vma->vm_next;
                if (!vma)
                        break;
                address = vma->vm_start;
        }
        /* Scanned everything: next call starts over from the beginning. */
        p->mm->swap_address = 0;
        return 0;
}
 519 
/*
 * Pick processes round-robin (via the static swap_task index) and try
 * to swap a page out of each.  'priority' scales the number of attempts
 * (higher priority value => fewer attempts).  Returns 1 as soon as a
 * page was actually freed, 0 otherwise.
 */
static int swap_out(unsigned int priority)
{
        static int swap_task;
        int loop, counter;
        struct task_struct *p;

        counter = 2*nr_tasks >> priority;
        for(; counter >= 0; counter--) {
                /*
                 * Check that swap_task is suitable for swapping.  If not, look for
                 * the next suitable process.
                 */
                loop = 0;
                while(1) {
                        if (swap_task >= NR_TASKS) {
                                swap_task = 1;
                                if (loop)
                                        /* all processes are unswappable or already swapped out */
                                        return 0;
                                loop = 1;
                        }

                        p = task[swap_task];
                        if (p && p->mm->swappable && p->mm->rss)
                                break;

                        swap_task++;
                }

                /*
                 * Determine the number of pages to swap from this process.
                 */
                if (!p->mm->swap_cnt) {
                        /* Decay the fault estimate (3/4) and add the major
                           faults since last time, so heavily-faulting
                           processes get a smaller swap budget. */
                        p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
                        p->mm->old_maj_flt = p->mm->maj_flt;

                        if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
                                p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
                                p->mm->swap_cnt = SWAP_MIN;
                        } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
                                p->mm->swap_cnt = SWAP_MAX;
                        else
                                p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
                }
                /* Budget exhausted: advance to the next task next round. */
                if (!--p->mm->swap_cnt)
                        swap_task++;
                switch (swap_out_process(p)) {
                        case 0:
                                /* Nothing swappable found in this process. */
                                if (p->mm->swap_cnt)
                                        swap_task++;
                                break;
                        case 1:
                                /* A page was freed - done. */
                                return 1;
                        default:
                                /* rss dropped but the page was shared: retry. */
                                break;
                }
        }
        return 0;
}
 579 
 580 /*
 581  * we keep on shrinking one resource until it's considered "too hard",
 582  * and then switch to the next one (priority being an indication on how
 583  * hard we should try with the resource).
 584  *
 585  * This should automatically find the resource that can most easily be
 586  * free'd, so hopefully we'll get reasonable behaviour even under very
 587  * different circumstances.
 588  */
static int try_to_free_page(int priority)
{
        /* 'state' remembers which resource was being shrunk when we last
           succeeded/stopped; the switch deliberately jumps into the
           middle of the do-while so successive calls resume there,
           rotating between buffers, shm and process pages. */
        static int state = 0;
        int i=6;

        switch (state) {
                do {
                case 0:
                        if (priority != GFP_NOBUFFER && shrink_buffers(i))
                                return 1;
                        state = 1;
                case 1:
                        if (shm_swap(i))
                                return 1;
                        state = 2;
                default:
                        if (swap_out(i))
                                return 1;
                        state = 0;
                } while(--i);   /* i counts down: each pass tries "harder" */
        }
        return 0;
}
 612 
 613 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 614 {
 615         entry->prev = head;
 616         (entry->next = head->next)->prev = entry;
 617         head->next = entry;
 618 }
 619 
 620 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 621 {
 622         entry->next->prev = entry->prev;
 623         entry->prev->next = entry->next;
 624 }
 625 
 626 /*
 627  * Free_page() adds the page to the free lists. This is optimized for
 628  * fast normal cases (no error jumps taken normally).
 629  *
 630  * The way to optimize jumps for gcc-2.2.2 is to:
 631  *  - select the "normal" case and put it inside the if () { XXX }
 632  *  - no else-statements if you can avoid them
 633  *
 634  * With the above two rules, you get a straight-line execution path
 635  * for the normal case, giving better asm-code.
 636  *
 637  * free_page() may sleep since the page being freed may be a buffer
 638  * page or present in the swap cache. It will not sleep, however,
 639  * for a freshly allocated page (get_free_page()).
 640  */
 641 
 642 /*
 643  * Buddy system. Hairy. You really aren't expected to understand this
 644  */
/*
 * Return a block of 2^order pages at 'addr' to the buddy free lists,
 * coalescing with its buddy whenever the buddy is free too.  Called
 * from free_pages() with interrupts disabled.
 */
static inline void free_pages_ok(unsigned long addr, unsigned long order)
{
        /* One map bit per buddy PAIR at each order. */
        unsigned long index = MAP_NR(addr) >> (1 + order);
        unsigned long mask = PAGE_MASK << order;

        addr &= mask;
        nr_free_pages += 1 << order;
        while (order < NR_MEM_LISTS-1) {
                /* change_bit() toggles and returns the old bit; old == 0
                   means the buddy is still allocated, so stop merging. */
                if (!change_bit(index, free_area_map[order]))
                        break;
                /* 1+~mask == -mask == the block size, so the XOR yields
                   the buddy's address: pull it off its free list. */
                remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
                order++;
                index >>= 1;
                mask <<= 1;
                addr &= mask;
        }
        add_mem_queue(free_area_list+order, (struct mem_list *) addr);
}
 663 
 664 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 665 {
 666         struct buffer_head * bh;
 667 
 668         bh = buffer_pages[MAP_NR(addr)];
 669         if (bh) {
 670                 struct buffer_head *tmp = bh;
 671                 do {
 672                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 673                                 refile_buffer(tmp);
 674                         tmp = tmp->b_this_page;
 675                 } while (tmp != bh);
 676         }
 677 }
 678 
/*
 * Drop one reference to the 2^order pages at 'addr'.  When the last
 * reference goes, the block is returned to the buddy lists and its
 * swap-cache entry is invalidated.  Addresses above high_memory are
 * silently ignored.
 */
void free_pages(unsigned long addr, unsigned long order)
{
        if (addr < high_memory) {
                unsigned long flag;
                mem_map_t * map = mem_map + MAP_NR(addr);
                if (*map) {
                        if (!(*map & MAP_PAGE_RESERVED)) {
                                /* Count manipulation must be atomic wrt
                                   interrupts (allocators run under cli). */
                                save_flags(flag);
                                cli();
                                if (!--*map)  {
                                        free_pages_ok(addr, order);
                                        delete_from_swap_cache(addr);
                                }
                                restore_flags(flag);
                                /* Exactly one user left: shared buffers on
                                   this page may now be refiled. */
                                if (*map == 1)
                                        check_free_buffers(addr);
                        }
                        return;
                }
                /* Count was already zero: double free or corruption. */
                printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
                printk("PC = %p\n", __builtin_return_address(0));
                return;
        }
}
 703 
 704 /*
 705  * Some ugly macros to speed up __get_free_pages()..
 706  */
 707 #define RMQUEUE(order) \
 708 do { struct mem_list * queue = free_area_list+order; \
 709      unsigned long new_order = order; \
 710         do { struct mem_list *next = queue->next; \
 711                 if (queue != next) { \
 712                         (queue->next = next->next)->prev = queue; \
 713                         mark_used((unsigned long) next, new_order); \
 714                         nr_free_pages -= 1 << order; \
 715                         restore_flags(flags); \
 716                         EXPAND(next, order, new_order); \
 717                         return (unsigned long) next; \
 718                 } new_order++; queue++; \
 719         } while (new_order < NR_MEM_LISTS); \
 720 } while (0)
 721 
/* Toggle the buddy-map bit covering the block at 'addr' for the given
   order; returns the previous bit value (change_bit semantics). */
static inline int mark_used(unsigned long addr, unsigned long order)
{
        return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
}
 726 
 727 #define EXPAND(addr,low,high) \
 728 do { unsigned long size = PAGE_SIZE << high; \
 729         while (high > low) { \
 730                 high--; size >>= 1; cli(); \
 731                 add_mem_queue(free_area_list+high, addr); \
 732                 mark_used((unsigned long) addr, high); \
 733                 restore_flags(flags); \
 734                 addr = (struct mem_list *) (size + (unsigned long) addr); \
 735         } mem_map[MAP_NR((unsigned long) addr)] = 1; \
 736 } while (0)
 737 
/*
 * Allocate 2^order contiguous pages.  'priority' (GFP_*) controls
 * whether we may sleep and how deep into the reserve we may go.
 * Returns the address of the first page, or 0 on failure.
 */
unsigned long __get_free_pages(int priority, unsigned long order)
{
        unsigned long flags;
        int reserved_pages;

        /* A sleeping allocation from interrupt context is a caller bug:
           warn a few times, then silently degrade to GFP_ATOMIC. */
        if (intr_count && priority != GFP_ATOMIC) {
                static int count = 0;
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                        priority = GFP_ATOMIC;
                }
        }
        /* GFP_NFS may dip further into the reserve (5 pages) than
           ordinary allocations (min_free_pages). */
        reserved_pages = 5;
        if (priority != GFP_NFS)
                reserved_pages = min_free_pages;
        save_flags(flags);
repeat:
        cli();
        if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
                /* NOTE: RMQUEUE returns from this function on success;
                   falling through here means the free lists were empty. */
                RMQUEUE(order);
                restore_flags(flags);
                return 0;
        }
        restore_flags(flags);
        /* Low on memory: try to free something and retry (may sleep). */
        if (priority != GFP_BUFFER && try_to_free_page(priority))
                goto repeat;
        return 0;
}
 767 
 768 /*
 769  * Yes, I know this is ugly. Don't tell me.
 770  */
/*
 * Allocate 2^order pages suitable for ISA DMA (below MAX_DMA_ADDRESS).
 * Unsuitable allocations are chained on a temporary list through their
 * first word and given back once a usable block (or failure, 0) is found.
 */
unsigned long __get_dma_pages(int priority, unsigned long order)
{
        unsigned long list = 0;
        unsigned long result;
        unsigned long limit = MAX_DMA_ADDRESS;

        /* if (EISA_bus) limit = ~0UL; */
        if (priority != GFP_ATOMIC)
                priority = GFP_BUFFER;
        for (;;) {
                result = __get_free_pages(priority, order);
                if (result < limit) /* covers failure as well */
                        break;
                /* Too high for DMA: park it on the throwaway list. */
                *(unsigned long *) result = list;
                list = result;
        }
        /* Release every page we couldn't use. */
        while (list) {
                unsigned long tmp = list;
                list = *(unsigned long *) list;
                free_pages(tmp, order);
        }
        return result;
}
 794 
 795 /*
 796  * Show free area list (used inside shift_scroll-lock stuff)
 797  * We also calculate the percentage fragmentation. We do this by counting the
 798  * memory on each free list with the exception of the first item on the list.
 799  */
 800 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 801 {
 802         unsigned long order, flags;
 803         unsigned long total = 0;
 804 
 805         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 806         save_flags(flags);
 807         cli();
 808         for (order=0 ; order < NR_MEM_LISTS; order++) {
 809                 struct mem_list * tmp;
 810                 unsigned long nr = 0;
 811                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 812                         nr ++;
 813                 }
 814                 total += nr * ((PAGE_SIZE>>10) << order);
 815                 printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
 816         }
 817         restore_flags(flags);
 818         printk("= %lukB)\n", total);
 819 #ifdef SWAP_CACHE_INFO
 820         show_swap_cache_info();
 821 #endif  
 822 }
 823 
 824 /*
 825  * Trying to stop swapping from a file is fraught with races, so
 826  * we repeat quite a bit here when we have to pause. swapoff()
 827  * isn't exactly timing-critical, so who cares (but this is /really/
 828  * inefficient, ugh).
 829  *
 830  * We return 1 after having slept, which makes the process start over
 831  * from the beginning for this process..
 832  */
 833 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 834         pte_t *dir, unsigned int type, unsigned long page)
 835 {
 836         pte_t pte = *dir;
 837 
 838         if (pte_none(pte))
 839                 return 0;
 840         if (pte_present(pte)) {
 841                 unsigned long page = pte_page(pte);
 842                 if (page >= high_memory)
 843                         return 0;
 844                 if (!in_swap_cache(page))
 845                         return 0;
 846                 if (SWP_TYPE(in_swap_cache(page)) != type)
 847                         return 0;
 848                 delete_from_swap_cache(page);
 849                 *dir = pte_mkdirty(pte);
 850                 return 0;
 851         }
 852         if (SWP_TYPE(pte_val(pte)) != type)
 853                 return 0;
 854         read_swap_page(pte_val(pte), (char *) page);
 855         if (pte_val(*dir) != pte_val(pte)) {
 856                 free_page(page);
 857                 return 1;
 858         }
 859         *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 860         ++vma->vm_task->mm->rss;
 861         swap_free(pte_val(pte));
 862         return 1;
 863 }
 864 
/*
 * Scan the ptes mapped by one pmd for entries referring to swap area
 * 'type', calling unuse_pte() on each.  Returns 1 as soon as a page
 * was swapped in (caller must restart), 0 once the range is done.
 */
static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
	unsigned int type, unsigned long page)
{
	pte_t * pte;
	unsigned long end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);		/* don't trip over it again */
		return 0;
	}
	pte = pte_offset(dir, address);
	/* fold the pmd-aligned part of the address into the offset ... */
	offset += address & PMD_MASK;
	/* ... and keep only the within-pmd part in 'address' */
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;		/* clamp to this pmd's reach */
	do {
		/* second argument is unused by unuse_pte() at present */
		if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
			return 1;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
 893 
/*
 * Scan the pmds covered by one pgd entry for swap area 'type',
 * delegating to unuse_pmd().  Returns 1 as soon as something was
 * swapped in (caller restarts), 0 once the range is fully scanned.
 */
static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
	unsigned int type, unsigned long page)
{
	pmd_t * pmd;
	unsigned long offset, end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);		/* don't trip over it again */
		return 0;
	}
	pmd = pmd_offset(dir, address);
	/* remember the pgd-aligned base, keep the within-pgd remainder */
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;	/* clamp to this pgd entry's reach */
	do {
		if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
			return 1;
		/* advance to the start of the next pmd */
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
 922 
 923 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 924         unsigned long start, unsigned long end,
 925         unsigned int type, unsigned long page)
 926 {
 927         while (start < end) {
 928                 if (unuse_pgd(vma, pgdir, start, end - start, type, page))
 929                         return 1;
 930                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 931                 pgdir++;
 932         }
 933         return 0;
 934 }
 935 
 936 static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
     /* [previous][next][first][last][top][bottom][index][help] */
 937 {
 938         struct vm_area_struct* vma;
 939 
 940         /*
 941          * Go through process' page directory.
 942          */
 943         vma = p->mm->mmap;
 944         while (vma) {
 945                 pgd_t * pgd = pgd_offset(p, vma->vm_start);
 946                 if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
 947                         return 1;
 948                 vma = vma->vm_next;
 949         }
 950         return 0;
 951 }
 952 
 953 /*
 954  * To avoid races, we repeat for each process after having
 955  * swapped something in. That gets rid of a few pesky races,
 956  * and "swapoff" isn't exactly timing critical.
 957  */
 958 static int try_to_unuse(unsigned int type)
     /* [previous][next][first][last][top][bottom][index][help] */
 959 {
 960         int nr;
 961         unsigned long page = get_free_page(GFP_KERNEL);
 962 
 963         if (!page)
 964                 return -ENOMEM;
 965         nr = 0;
 966         while (nr < NR_TASKS) {
 967                 if (task[nr]) {
 968                         if (unuse_process(task[nr], type, page)) {
 969                                 page = get_free_page(GFP_KERNEL);
 970                                 if (!page)
 971                                         return -ENOMEM;
 972                                 continue;
 973                         }
 974                 }
 975                 nr++;
 976         }
 977         free_page(page);
 978         return 0;
 979 }
 980 
/*
 * The swapoff system call: disable swapping on the given file or
 * block device.  All pages still out on that area are pulled back
 * into memory via try_to_unuse(), then the area's bookkeeping is
 * torn down.
 */
asmlinkage int sys_swapoff(const char * specialfile)
{
	struct swap_info_struct * p;
	struct inode * inode;
	unsigned int type;
	struct file filp;
	int i;

	if (!suser())
		return -EPERM;
	i = namei(specialfile,&inode);
	if (i)
		return i;
	/* find the active swap area matching this file or device */
	p = swap_info;
	for (type = 0 ; type < nr_swapfiles ; type++,p++) {
		if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
			continue;
		if (p->swap_file) {
			if (p->swap_file == inode)
				break;
		} else {
			if (!S_ISBLK(inode->i_mode))
				continue;
			if (p->swap_device == inode->i_rdev)
				break;
		}
	}

	if (type >= nr_swapfiles){
		iput(inode);
		return -EINVAL;
	}
	/* stop further swap-outs to this area while we drain it */
	p->flags = SWP_USED;
	i = try_to_unuse(type);
	if (i) {
		/* couldn't pull everything back: re-enable the area */
		iput(inode);
		p->flags = SWP_WRITEOK;
		return i;
	}

	if(p->swap_device){
		memset(&filp, 0, sizeof(filp));
		filp.f_inode = inode;
		filp.f_mode = 3; /* read write */
		/* open it again to get fops */
		if( !blkdev_open(inode, &filp) &&
		   filp.f_op && filp.f_op->release){
			/*
			 * Release twice: once for the open we just did,
			 * and once to balance the blkdev_open() done at
			 * swapon() time, which was never released.
			 */
			filp.f_op->release(inode,&filp);
			filp.f_op->release(inode,&filp);
		}
	}
	iput(inode);

	/* tear down the bookkeeping and mark the slot unused */
	nr_swap_pages -= p->pages;
	iput(p->swap_file);
	p->swap_file = NULL;
	p->swap_device = 0;
	vfree(p->swap_map);
	p->swap_map = NULL;
	free_page((long) p->swap_lockmap);
	p->swap_lockmap = NULL;
	p->flags = 0;
	return 0;
}
1045 
1046 /*
1047  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1048  *
1049  * The swapon system call
1050  */
1051 asmlinkage int sys_swapon(const char * specialfile)
     /* [previous][next][first][last][top][bottom][index][help] */
1052 {
1053         struct swap_info_struct * p;
1054         struct inode * swap_inode;
1055         unsigned int type;
1056         int i,j;
1057         int error;
1058         struct file filp;
1059 
1060         memset(&filp, 0, sizeof(filp));
1061         if (!suser())
1062                 return -EPERM;
1063         p = swap_info;
1064         for (type = 0 ; type < nr_swapfiles ; type++,p++)
1065                 if (!(p->flags & SWP_USED))
1066                         break;
1067         if (type >= MAX_SWAPFILES)
1068                 return -EPERM;
1069         if (type >= nr_swapfiles)
1070                 nr_swapfiles = type+1;
1071         p->flags = SWP_USED;
1072         p->swap_file = NULL;
1073         p->swap_device = 0;
1074         p->swap_map = NULL;
1075         p->swap_lockmap = NULL;
1076         p->lowest_bit = 0;
1077         p->highest_bit = 0;
1078         p->max = 1;
1079         error = namei(specialfile,&swap_inode);
1080         if (error)
1081                 goto bad_swap_2;
1082         p->swap_file = swap_inode;
1083         error = -EBUSY;
1084         if (swap_inode->i_count != 1)
1085                 goto bad_swap_2;
1086         error = -EINVAL;
1087 
1088         if (S_ISBLK(swap_inode->i_mode)) {
1089                 p->swap_device = swap_inode->i_rdev;
1090 
1091                 filp.f_inode = swap_inode;
1092                 filp.f_mode = 3; /* read write */
1093                 error = blkdev_open(swap_inode, &filp);
1094                 p->swap_file = NULL;
1095                 iput(swap_inode);
1096                 if(error)
1097                         goto bad_swap_2;
1098                 error = -ENODEV;
1099                 if (!p->swap_device)
1100                         goto bad_swap;
1101                 error = -EBUSY;
1102                 for (i = 0 ; i < nr_swapfiles ; i++) {
1103                         if (i == type)
1104                                 continue;
1105                         if (p->swap_device == swap_info[i].swap_device)
1106                                 goto bad_swap;
1107                 }
1108         } else if (!S_ISREG(swap_inode->i_mode))
1109                 goto bad_swap;
1110         p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
1111         if (!p->swap_lockmap) {
1112                 printk("Unable to start swapping: out of memory :-)\n");
1113                 error = -ENOMEM;
1114                 goto bad_swap;
1115         }
1116         read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
1117         if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) {
1118                 printk("Unable to find swap-space signature\n");
1119                 error = -EINVAL;
1120                 goto bad_swap;
1121         }
1122         memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
1123         j = 0;
1124         p->lowest_bit = 0;
1125         p->highest_bit = 0;
1126         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
1127                 if (test_bit(i,p->swap_lockmap)) {
1128                         if (!p->lowest_bit)
1129                                 p->lowest_bit = i;
1130                         p->highest_bit = i;
1131                         p->max = i+1;
1132                         j++;
1133                 }
1134         }
1135         if (!j) {
1136                 printk("Empty swap-file\n");
1137                 error = -EINVAL;
1138                 goto bad_swap;
1139         }
1140         p->swap_map = (unsigned char *) vmalloc(p->max);
1141         if (!p->swap_map) {
1142                 error = -ENOMEM;
1143                 goto bad_swap;
1144         }
1145         for (i = 1 ; i < p->max ; i++) {
1146                 if (test_bit(i,p->swap_lockmap))
1147                         p->swap_map[i] = 0;
1148                 else
1149                         p->swap_map[i] = 0x80;
1150         }
1151         p->swap_map[0] = 0x80;
1152         memset(p->swap_lockmap,0,PAGE_SIZE);
1153         p->flags = SWP_WRITEOK;
1154         p->pages = j;
1155         nr_swap_pages += j;
1156         printk("Adding Swap: %dk swap-space\n",j<<2);
1157         return 0;
1158 bad_swap:
1159         if(filp.f_op && filp.f_op->release)
1160                 filp.f_op->release(filp.f_inode,&filp);
1161 bad_swap_2:
1162         free_page((long) p->swap_lockmap);
1163         vfree(p->swap_map);
1164         iput(p->swap_file);
1165         p->swap_device = 0;
1166         p->swap_file = NULL;
1167         p->swap_map = NULL;
1168         p->swap_lockmap = NULL;
1169         p->flags = 0;
1170         return error;
1171 }
1172 
1173 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1174 {
1175         unsigned int i, j;
1176 
1177         val->freeswap = val->totalswap = 0;
1178         for (i = 0; i < nr_swapfiles; i++) {
1179                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1180                         continue;
1181                 for (j = 0; j < swap_info[i].max; ++j)
1182                         switch (swap_info[i].swap_map[j]) {
1183                                 case 128:
1184                                         continue;
1185                                 case 0:
1186                                         ++val->freeswap;
1187                                 default:
1188                                         ++val->totalswap;
1189                         }
1190         }
1191         val->freeswap <<= PAGE_SHIFT;
1192         val->totalswap <<= PAGE_SHIFT;
1193         return;
1194 }
1195 
/*
 * set up the free-area data structures:
 *   - mark all pages MAP_PAGE_RESERVED
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * Carves mem_map and the per-order buddy bitmaps out of the boot
 * memory starting at start_mem, and returns the new start_mem.
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
	mem_map_t * p;
	unsigned long mask = PAGE_MASK;
	int i;

	/*
	 * select nr of pages we try to keep free for important stuff
	 * with a minimum of 16 pages. This is totally arbitrary
	 */
	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);	/* ~1/64 of memory */
	if (i < 16)
		i = 16;
	min_free_pages = i;
	start_mem = init_swap_cache(start_mem, end_mem);
	/* place the mem_map array at the current allocation point */
	mem_map = (mem_map_t *) start_mem;
	p = mem_map + MAP_NR(end_mem);
	start_mem = (unsigned long) p;
	/*
	 * Start with every page reserved; presumably the usable pages
	 * are released to the free lists later by the caller -- verify
	 * against the boot-time code that calls this.
	 */
	while (p > mem_map)
		*--p = MAP_PAGE_RESERVED;

	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
		unsigned long bitmap_size;
		/* each order's free list starts out as an empty ring */
		free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
		mask += mask;			/* alignment for 2^(i+1) pages */
		end_mem = (end_mem + ~mask) & mask;	/* round up to that size */
		/* one bit per buddy pair at this order */
		bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
		bitmap_size = (bitmap_size + 7) >> 3;	/* bits -> bytes */
		/* round up to an unsigned long boundary */
		bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
		free_area_map[i] = (unsigned char *) start_mem;
		memset((void *) start_mem, 0, bitmap_size);
		start_mem += bitmap_size;
	}
	return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */