root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_pmd
  11. swap_out_pgd
  12. swap_out_vma
  13. swap_out_process
  14. swap_out
  15. try_to_free_page
  16. add_mem_queue
  17. remove_mem_queue
  18. free_pages_ok
  19. check_free_buffers
  20. free_pages
  21. mark_used
  22. __get_free_pages
  23. __get_dma_pages
  24. show_free_areas
  25. unuse_pte
  26. unuse_pmd
  27. unuse_pgd
  28. unuse_vma
  29. unuse_process
  30. try_to_unuse
  31. sys_swapoff
  32. sys_swapon
  33. si_swapinfo
  34. free_area_init

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file should contain most things doing the swapping from/to disk.
   9  * Started 18.12.91
  10  */
  11 
  12 #include <linux/mm.h>
  13 #include <linux/sched.h>
  14 #include <linux/head.h>
  15 #include <linux/kernel.h>
  16 #include <linux/kernel_stat.h>
  17 #include <linux/errno.h>
  18 #include <linux/string.h>
  19 #include <linux/stat.h>
  20 #include <linux/fs.h>
  21 
  22 #include <asm/dma.h>
  23 #include <asm/system.h> /* for cli()/sti() */
  24 #include <asm/bitops.h>
  25 #include <asm/pgtable.h>
  26 
  27 #define MAX_SWAPFILES 8
  28 
  29 #define SWP_USED        1
  30 #define SWP_WRITEOK     3
  31 
  32 #define SWP_TYPE(entry) (((entry) >> 1) & 0x7f)
  33 #define SWP_OFFSET(entry) ((entry) >> 12)
  34 #define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << 12))
  35 
  36 int min_free_pages = 20;
  37 
  38 static int nr_swapfiles = 0;
  39 static struct wait_queue * lock_queue = NULL;
  40 
  41 static struct swap_info_struct {
  42         unsigned long flags;
  43         struct inode * swap_file;
  44         unsigned int swap_device;
  45         unsigned char * swap_map;
  46         unsigned char * swap_lockmap;
  47         int pages;
  48         int lowest_bit;
  49         int highest_bit;
  50         unsigned long max;
  51 } swap_info[MAX_SWAPFILES];
  52 
  53 extern int shm_swap (int);
  54 
  55 unsigned long *swap_cache;
  56 
  57 #ifdef SWAP_CACHE_INFO
  58 unsigned long swap_cache_add_total = 0;
  59 unsigned long swap_cache_add_success = 0;
  60 unsigned long swap_cache_del_total = 0;
  61 unsigned long swap_cache_del_success = 0;
  62 unsigned long swap_cache_find_total = 0;
  63 unsigned long swap_cache_find_success = 0;
  64 
/*
 * Dump the swap-cache statistics counters to the console.
 * Each pair prints as attempts/successes for the add, delete
 * and find operations (counters defined above).
 */
extern inline void show_swap_cache_info(void)
{
        printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
                swap_cache_add_total, swap_cache_add_success, 
                swap_cache_del_total, swap_cache_del_success,
                swap_cache_find_total, swap_cache_find_success);
}
  72 #endif
  73 
/*
 * Record that the physical page at 'addr' still has a valid copy in
 * swap slot 'entry'.  Only done for swap areas that are fully
 * writable (SWP_WRITEOK).  Returns 1 if the entry was cached,
 * 0 if the area is not writable.
 */
static int add_to_swap_cache(unsigned long addr, unsigned long entry)
{
        struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];

#ifdef SWAP_CACHE_INFO
        swap_cache_add_total++;
#endif
        if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
                /* atomically install the new entry; the slot should have been empty */
                entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
                if (entry)  {
                        printk("swap_cache: replacing non-NULL entry\n");
                }
#ifdef SWAP_CACHE_INFO
                swap_cache_add_success++;
#endif
                return 1;
        }
        return 0;
}
  93 
  94 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  95         unsigned long mem_end)
  96 {
  97         unsigned long swap_cache_size;
  98 
  99         mem_start = (mem_start + 15) & ~15;
 100         swap_cache = (unsigned long *) mem_start;
 101         swap_cache_size = MAP_NR(mem_end);
 102         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
 103         return (unsigned long) (swap_cache + swap_cache_size);
 104 }
 105 
 106 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 107 {
 108         unsigned long type, offset;
 109         struct swap_info_struct * p;
 110 
 111         type = SWP_TYPE(entry);
 112         if (type >= nr_swapfiles) {
 113                 printk("Internal error: bad swap-device\n");
 114                 return;
 115         }
 116         p = &swap_info[type];
 117         offset = SWP_OFFSET(entry);
 118         if (offset >= p->max) {
 119                 printk("rw_swap_page: weirdness\n");
 120                 return;
 121         }
 122         if (p->swap_map && !p->swap_map[offset]) {
 123                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 124                 return;
 125         }
 126         if (!(p->flags & SWP_USED)) {
 127                 printk("Trying to swap to unused swap-device\n");
 128                 return;
 129         }
 130         while (set_bit(offset,p->swap_lockmap))
 131                 sleep_on(&lock_queue);
 132         if (rw == READ)
 133                 kstat.pswpin++;
 134         else
 135                 kstat.pswpout++;
 136         if (p->swap_device) {
 137                 ll_rw_page(rw,p->swap_device,offset,buf);
 138         } else if (p->swap_file) {
 139                 struct inode *swapf = p->swap_file;
 140                 unsigned int zones[8];
 141                 int i;
 142                 if (swapf->i_op->bmap == NULL
 143                         && swapf->i_op->smap != NULL){
 144                         /*
 145                                 With MsDOS, we use msdos_smap which return
 146                                 a sector number (not a cluster or block number).
 147                                 It is a patch to enable the UMSDOS project.
 148                                 Other people are working on better solution.
 149 
 150                                 It sounds like ll_rw_swap_file defined
 151                                 it operation size (sector size) based on
 152                                 PAGE_SIZE and the number of block to read.
 153                                 So using bmap or smap should work even if
 154                                 smap will require more blocks.
 155                         */
 156                         int j;
 157                         unsigned int block = offset << 3;
 158 
 159                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 160                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 161                                         printk("rw_swap_page: bad swap file\n");
 162                                         return;
 163                                 }
 164                         }
 165                 }else{
 166                         int j;
 167                         unsigned int block = offset
 168                                 << (12 - swapf->i_sb->s_blocksize_bits);
 169 
 170                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 171                                 if (!(zones[i] = bmap(swapf,block++))) {
 172                                         printk("rw_swap_page: bad swap file\n");
 173                                         return;
 174                                 }
 175                 }
 176                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 177         } else
 178                 printk("re_swap_page: no swap file or device\n");
 179         if (offset && !clear_bit(offset,p->swap_lockmap))
 180                 printk("rw_swap_page: lock already cleared\n");
 181         wake_up(&lock_queue);
 182 }
 183 
 184 unsigned int get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 185 {
 186         struct swap_info_struct * p;
 187         unsigned int offset, type;
 188 
 189         p = swap_info;
 190         for (type = 0 ; type < nr_swapfiles ; type++,p++) {
 191                 if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
 192                         continue;
 193                 for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 194                         if (p->swap_map[offset])
 195                                 continue;
 196                         if (test_bit(offset, p->swap_lockmap))
 197                                 continue;
 198                         p->swap_map[offset] = 1;
 199                         nr_swap_pages--;
 200                         if (offset == p->highest_bit)
 201                                 p->highest_bit--;
 202                         p->lowest_bit = offset;
 203                         return SWP_ENTRY(type,offset);
 204                 }
 205         }
 206         return 0;
 207 }
 208 
 209 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 210 {
 211         struct swap_info_struct * p;
 212         unsigned long offset, type;
 213 
 214         if (!entry)
 215                 return;
 216         offset = SWP_OFFSET(entry);
 217         type = SWP_TYPE(entry);
 218         if (type == SHM_SWP_TYPE)
 219                 return;
 220         if (type >= nr_swapfiles) {
 221                 printk("Trying to duplicate nonexistent swap-page\n");
 222                 return;
 223         }
 224         p = type + swap_info;
 225         if (offset >= p->max) {
 226                 printk("swap_duplicate: weirdness\n");
 227                 return;
 228         }
 229         if (!p->swap_map[offset]) {
 230                 printk("swap_duplicate: trying to duplicate unused page\n");
 231                 return;
 232         }
 233         p->swap_map[offset]++;
 234         return;
 235 }
 236 
 237 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 238 {
 239         struct swap_info_struct * p;
 240         unsigned long offset, type;
 241 
 242         if (!entry)
 243                 return;
 244         type = SWP_TYPE(entry);
 245         if (type == SHM_SWP_TYPE)
 246                 return;
 247         if (type >= nr_swapfiles) {
 248                 printk("Trying to free nonexistent swap-page\n");
 249                 return;
 250         }
 251         p = & swap_info[type];
 252         offset = SWP_OFFSET(entry);
 253         if (offset >= p->max) {
 254                 printk("swap_free: weirdness\n");
 255                 return;
 256         }
 257         if (!(p->flags & SWP_USED)) {
 258                 printk("Trying to free swap from unused swap-device\n");
 259                 return;
 260         }
 261         if (offset < p->lowest_bit)
 262                 p->lowest_bit = offset;
 263         if (offset > p->highest_bit)
 264                 p->highest_bit = offset;
 265         if (!p->swap_map[offset])
 266                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 267         else
 268                 if (!--p->swap_map[offset])
 269                         nr_swap_pages++;
 270 }
 271 
/*
 * Page a swapped-out page back in at *page_table.
 *
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting
 * (in get_free_page() or during the disk read).
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access: the page is mapped dirty+writable
 * right away, so the on-disk copy would be stale anyway.
 */
void swap_in(struct vm_area_struct * vma, pte_t * page_table,
        unsigned long entry, int write_access)
{
        unsigned long page = get_free_page(GFP_KERNEL);

        if (pte_val(*page_table) != entry) {
                /* somebody else swapped it in while we allocated */
                free_page(page);
                return;
        }
        if (!page) {
                *page_table = BAD_PAGE;
                swap_free(entry);
                oom(current);
                return;
        }
        read_swap_page(entry, (char *) page);
        if (pte_val(*page_table) != entry) {
                /* raced again during the disk read */
                free_page(page);
                return;
        }
        vma->vm_task->mm->rss++;
        vma->vm_task->mm->maj_flt++;
        if (!write_access && add_to_swap_cache(page, entry)) {
                /* read fault: map read-only, keep the swap copy and its reference */
                *page_table = mk_pte(page, vma->vm_page_prot);
                return;
        }
        *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        swap_free(entry);
        return;
}
 309 
 310 /*
 311  * The swap-out functions return 1 of they successfully
 312  * threw something out, and we got a free page. It returns
 313  * zero if it couldn't do anything, and any other value
 314  * indicates it decreased rss, but the page was shared.
 315  *
 316  * NOTE! If it sleeps, it *must* return 1 to make sure we
 317  * don't continue with the swap-out. Otherwise we may be
 318  * using a process that no longer actually exists (it might
 319  * have died while we slept).
 320  */
static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table)
{
        pte_t pte;
        unsigned long entry;
        unsigned long page;

        pte = *page_table;
        if (!pte_present(pte))
                return 0;
        page = pte_page(pte);
        if (page >= high_memory)
                return 0;
        if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
                return 0;
        /*
         * Recently referenced pages (and dirty pages whose stale
         * swap-cache copy we just invalidated) only get aged here,
         * not evicted.
         */
        if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
                *page_table = pte_mkold(pte);
                return 0;
        }
        if (pte_dirty(pte)) {
                /* dirty and shared by more than one mapping: leave it alone */
                if (mem_map[MAP_NR(page)] != 1)
                        return 0;
                if (vma->vm_ops && vma->vm_ops->swapout) {
                        /* the mapping provides its own writeout method */
                        vma->vm_task->mm->rss--;
                        vma->vm_ops->swapout(vma, address-vma->vm_start, page_table);
                } else {
                        if (!(entry = get_swap_page()))
                                return 0;
                        vma->vm_task->mm->rss--;
                        pte_val(*page_table) = entry;
                        invalidate();
                        write_swap_page(entry, (char *) page);
                }
                free_page(page);
                return 1;       /* we slept: the process may not exist any more */
        }
        /*
         * Clean page with a valid swap-cache copy: point the pte back
         * at the swap entry, no I/O needed.
         */
        if ((entry = find_in_swap_cache(page)))  {
                if (mem_map[MAP_NR(page)] != 1) {
                        *page_table = pte_mkdirty(pte);
                        printk("Aiee.. duplicated cached swap-cache entry\n");
                        return 0;
                }
                vma->vm_task->mm->rss--;
                pte_val(*page_table) = entry;
                invalidate();
                free_page(page);
                return 1;
        }
        /* clean page with no swap copy: just drop the mapping */
        vma->vm_task->mm->rss--;
        pte_clear(page_table);
        invalidate();
        entry = mem_map[MAP_NR(page)];
        free_page(page);
        return entry;   /* old map count: >1 means shared, page not actually freed */
}
 375 
 376 /*
 377  * A new implementation of swap_out().  We do not swap complete processes,
 378  * but only a small number of blocks, before we continue with the next
 379  * process.  The number of blocks actually swapped is determined on the
 380  * number of page faults, that this process actually had in the last time,
 381  * so we won't swap heavily used processes all the time ...
 382  *
 383  * Note: the priority argument is a hint on much CPU to waste with the
 384  *       swap block search, not a hint, of how much blocks to swap with
 385  *       each process.
 386  *
 387  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 388  */
 389 
 390 /*
 391  * These are the minimum and maximum number of pages to swap from one process,
 392  * before proceeding to the next:
 393  */
 394 #define SWAP_MIN        4
 395 #define SWAP_MAX        32
 396 
 397 /*
 398  * The actual number of pages to swap is determined as:
 399  * SWAP_RATIO / (number of recent major page faults)
 400  */
 401 #define SWAP_RATIO      128
 402 
 403 static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 404         unsigned long address, unsigned long end)
 405 {
 406         pte_t * pte;
 407         unsigned long pmd_end;
 408 
 409         if (pmd_none(*dir))
 410                 return 0;
 411         if (pmd_bad(*dir)) {
 412                 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 413                 pmd_clear(dir);
 414                 return 0;
 415         }
 416         
 417         pte = pte_offset(dir, address);
 418         
 419         pmd_end = (address + PMD_SIZE) & PMD_MASK;
 420         if (end > pmd_end)
 421                 end = pmd_end;
 422 
 423         do {
 424                 int result;
 425                 vma->vm_task->mm->swap_address = address + PAGE_SIZE;
 426                 result = try_to_swap_out(vma, address, pte);
 427                 if (result)
 428                         return result;
 429                 address += PAGE_SIZE;
 430                 pte++;
 431         } while (address < end);
 432         return 0;
 433 }
 434 
 435 static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 436         unsigned long address, unsigned long end)
 437 {
 438         pmd_t * pmd;
 439         unsigned long pgd_end;
 440 
 441         if (pgd_none(*dir))
 442                 return 0;
 443         if (pgd_bad(*dir)) {
 444                 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 445                 pgd_clear(dir);
 446                 return 0;
 447         }
 448 
 449         pmd = pmd_offset(dir, address);
 450 
 451         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 452         if (end > pgd_end)
 453                 end = pgd_end;
 454         
 455         do {
 456                 int result = swap_out_pmd(vma, pmd, address, end);
 457                 if (result)
 458                         return result;
 459                 address = (address + PMD_SIZE) & PMD_MASK;
 460                 pmd++;
 461         } while (address < end);
 462         return 0;
 463 }
 464 
 465 static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 466         unsigned long start)
 467 {
 468         unsigned long end;
 469 
 470         /* Don't swap out areas like shared memory which have their
 471             own separate swapping mechanism. */
 472         if (vma->vm_flags & VM_SHM)
 473                 return 0;
 474 
 475         end = vma->vm_end;
 476         while (start < end) {
 477                 int result = swap_out_pgd(vma, pgdir, start, end);
 478                 if (result)
 479                         return result;
 480                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 481                 pgdir++;
 482         }
 483         return 0;
 484 }
 485 
/*
 * Resume swapping this process where we left off (mm->swap_address)
 * and walk its vmas until one page was thrown out (nonzero return)
 * or the address space is exhausted (0).  swap_address is cleared
 * up front so a completed walk restarts from the beginning next
 * time; swap_out_pmd() re-records the resume point as it goes.
 */
static int swap_out_process(struct task_struct * p)
{
        unsigned long address;
        struct vm_area_struct* vma;

        /*
         * Go through process' page directory.
         */
        address = p->mm->swap_address;
        p->mm->swap_address = 0;

        /*
         * Find the proper vm-area
         */
        vma = find_vma(p, address);
        if (!vma)
                return 0;
        if (address < vma->vm_start)
                address = vma->vm_start;

        for (;;) {
                int result = swap_out_vma(vma, pgd_offset(p, address), address);
                if (result)
                        return result;
                vma = vma->vm_next;
                if (!vma)
                        break;
                address = vma->vm_start;
        }
        p->mm->swap_address = 0;
        return 0;
}
 518 
/*
 * Scan processes round-robin (the static swap_task remembers where
 * the last scan stopped) and try to steal pages from them.  The
 * priority scales how many attempts are made before giving up.
 * Returns 1 as soon as one page has been freed, 0 otherwise.
 *
 * Each process gets a swap budget (mm->swap_cnt) derived from its
 * recent major-fault rate: heavily faulting processes lose fewer
 * pages per visit (SWAP_RATIO / dec_flt, clamped to SWAP_MIN..SWAP_MAX).
 */
static int swap_out(unsigned int priority)
{
        static int swap_task;   /* round-robin position in task[] */
        int loop, counter;
        struct task_struct *p;

        counter = 6*nr_tasks >> priority;
        for(; counter >= 0; counter--) {
                /*
                 * Check that swap_task is suitable for swapping.  If not, look for
                 * the next suitable process.
                 */
                loop = 0;
                while(1) {
                        if (swap_task >= NR_TASKS) {
                                /* wrap past slot 0 (never swapped); second wrap means give up */
                                swap_task = 1;
                                if (loop)
                                        /* all processes are unswappable or already swapped out */
                                        return 0;
                                loop = 1;
                        }

                        p = task[swap_task];
                        if (p && p->mm->swappable && p->mm->rss)
                                break;

                        swap_task++;
                }

                /*
                 * Determine the number of pages to swap from this process.
                 */
                if (!p->mm->swap_cnt) {
                        /* decaying average of major faults since the last visit */
                        p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
                        p->mm->old_maj_flt = p->mm->maj_flt;

                        if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
                                p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
                                p->mm->swap_cnt = SWAP_MIN;
                        } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
                                p->mm->swap_cnt = SWAP_MAX;
                        else
                                p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
                }
                /* budget exhausted: advance to the next task after this attempt */
                if (!--p->mm->swap_cnt)
                        swap_task++;
                switch (swap_out_process(p)) {
                        case 0:
                                /* nothing swapped: don't keep hammering this task */
                                if (p->mm->swap_cnt)
                                        swap_task++;
                                break;
                        case 1:
                                return 1;
                        default:
                                break;
                }
        }
        return 0;
}
 578 
 579 /*
 580  * we keep on shrinking one resource until it's considered "too hard",
 581  * and then switch to the next one (priority being an indication on how
 582  * hard we should try with the resource).
 583  *
 584  * This should automatically find the resource that can most easily be
 585  * free'd, so hopefully we'll get reasonable behaviour even under very
 586  * different circumstances.
 587  */
static int try_to_free_page(int priority)
{
        static int state = 0;   /* which resource we were shrinking last call */
        int i=6;

        /*
         * The switch jumps into the middle of the do-while so each call
         * resumes with the resource that was being shrunk previously;
         * every pass then falls through buffers -> shm -> process pages,
         * with 'i' ratcheting the effort up as passes fail.
         */
        switch (state) {
                do {
                case 0:
                        if (priority != GFP_NOBUFFER && shrink_buffers(i))
                                return 1;
                        state = 1;
                case 1:
                        if (shm_swap(i))
                                return 1;
                        state = 2;
                default:
                        if (swap_out(i))
                                return 1;
                        state = 0;
                } while(i--);
        }
        return 0;
}
 611 
 612 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 613 {
 614         entry->prev = head;
 615         (entry->next = head->next)->prev = entry;
 616         head->next = entry;
 617 }
 618 
 619 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 620 {
 621         entry->next->prev = entry->prev;
 622         entry->prev->next = entry->next;
 623 }
 624 
 625 /*
 626  * Free_page() adds the page to the free lists. This is optimized for
 627  * fast normal cases (no error jumps taken normally).
 628  *
 629  * The way to optimize jumps for gcc-2.2.2 is to:
 630  *  - select the "normal" case and put it inside the if () { XXX }
 631  *  - no else-statements if you can avoid them
 632  *
 633  * With the above two rules, you get a straight-line execution path
 634  * for the normal case, giving better asm-code.
 635  *
 636  * free_page() may sleep since the page being freed may be a buffer
 637  * page or present in the swap cache. It will not sleep, however,
 638  * for a freshly allocated page (get_free_page()).
 639  */
 640 
 641 /*
 642  * Buddy system. Hairy. You really aren't expected to understand this
 643  */
/*
 * Return a block of 2^order pages to the buddy free lists.  While the
 * buddy of the block is also free (tracked by one toggling bit per
 * buddy pair in free_area_map) the two are merged into a block of the
 * next higher order before being queued.  Called with interrupts
 * disabled (see free_pages()).
 */
static inline void free_pages_ok(unsigned long addr, unsigned long order)
{
        unsigned long index = MAP_NR(addr) >> (1 + order);
        unsigned long mask = PAGE_MASK << order;

        addr &= mask;
        nr_free_pages += 1 << order;
        while (order < NR_MEM_LISTS-1) {
                /* bit toggles back to 0 when both buddies are free */
                if (!change_bit(index, free_area_map[order]))
                        break;
                /* addr ^ (1+~mask) flips the single bit that separates the buddies */
                remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
                order++;
                index >>= 1;
                mask <<= 1;
                addr &= mask;
        }
        add_mem_queue(free_area_list+order, (struct mem_list *) addr);
}
 662 
 663 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 664 {
 665         struct buffer_head * bh;
 666 
 667         bh = buffer_pages[MAP_NR(addr)];
 668         if (bh) {
 669                 struct buffer_head *tmp = bh;
 670                 do {
 671                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 672                                 refile_buffer(tmp);
 673                         tmp = tmp->b_this_page;
 674                 } while (tmp != bh);
 675         }
 676 }
 677 
/*
 * Drop one reference to a block of 2^order pages.  Addresses above
 * high_memory and reserved pages are ignored.  When the map count
 * reaches zero the block goes back to the buddy lists and any
 * swap-cache entry for it is dropped; when exactly one user remains,
 * its buffers may be eligible for refiling.
 */
void free_pages(unsigned long addr, unsigned long order)
{
        if (addr < high_memory) {
                unsigned long flag;
                mem_map_t * map = mem_map + MAP_NR(addr);
                if (*map) {
                        if (!(*map & MAP_PAGE_RESERVED)) {
                                /* the count update and list insertion must be atomic */
                                save_flags(flag);
                                cli();
                                if (!--*map)  {
                                        free_pages_ok(addr, order);
                                        delete_from_swap_cache(addr);
                                }
                                restore_flags(flag);
                                if (*map == 1)
                                        check_free_buffers(addr);
                        }
                        return;
                }
                /* count was already zero: double free or corrupted mem_map */
                printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
                printk("PC = %p\n", __builtin_return_address(0));
                return;
        }
}
 702 
/*
 * Some ugly macros to speed up __get_free_pages()..
 */
/*
 * RMQUEUE: scan the free lists from 'order' upwards; at the first
 * non-empty list, unlink the head block, split it back down to the
 * requested order with EXPAND, and return its address directly from
 * the enclosing function.  Falls through when every list is empty.
 */
#define RMQUEUE(order) \
do { struct mem_list * queue = free_area_list+order; \
     unsigned long new_order = order; \
        do { struct mem_list *next = queue->next; \
                if (queue != next) { \
                        (queue->next = next->next)->prev = queue; \
                        mark_used((unsigned long) next, new_order); \
                        nr_free_pages -= 1 << order; \
                        restore_flags(flags); \
                        EXPAND(next, order, new_order); \
                        return (unsigned long) next; \
                } new_order++; queue++; \
        } while (new_order < NR_MEM_LISTS); \
} while (0)
 720 
 721 static inline int mark_used(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 722 {
 723         return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
 724 }
 725 
/*
 * EXPAND: repeatedly halve a block of order 'high' down to order
 * 'low', queueing one half on the free list of each intermediate
 * order and continuing with the other, then set the final block's
 * mem_map count to 1 (single owner).
 */
#define EXPAND(addr,low,high) \
do { unsigned long size = PAGE_SIZE << high; \
        while (high > low) { \
                high--; size >>= 1; cli(); \
                add_mem_queue(free_area_list+high, addr); \
                mark_used((unsigned long) addr, high); \
                restore_flags(flags); \
                addr = (struct mem_list *) (size + (unsigned long) addr); \
        } mem_map[MAP_NR((unsigned long) addr)] = 1; \
} while (0)
 736 
/*
 * Allocate a block of 2^order contiguous pages; returns its address,
 * or 0 on failure.  GFP_ATOMIC requests may dip into the reserved
 * pool and never sleep; other priorities try to free pages and retry
 * while memory is tight.  Note that RMQUEUE() returns from this
 * function directly on success, so the 'return 0' right after it is
 * the all-queues-empty failure path.
 */
unsigned long __get_free_pages(int priority, unsigned long order)
{
        unsigned long flags;
        int reserved_pages;

        if (intr_count && priority != GFP_ATOMIC) {
                static int count = 0;   /* rate-limit the warning */
                if (++count < 5) {
                        printk("gfp called nonatomically from interrupt %p\n",
                                __builtin_return_address(0));
                        priority = GFP_ATOMIC;
                }
        }
        /* GFP_NFS is allowed to dig deeper into the reserve */
        reserved_pages = 5;
        if (priority != GFP_NFS)
                reserved_pages = min_free_pages;
        save_flags(flags);
repeat:
        cli();
        if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
                RMQUEUE(order);
                restore_flags(flags);
                return 0;
        }
        restore_flags(flags);
        if (priority != GFP_BUFFER && try_to_free_page(priority))
                goto repeat;
        return 0;
}
 766 
 767 /*
 768  * Yes, I know this is ugly. Don't tell me.
 769  */
/*
 * Allocate 2^order pages usable for ISA DMA (below MAX_DMA_ADDRESS).
 * Blocks above the limit are parked on a temporary list -- threaded
 * through their own first word -- so the allocator cannot hand them
 * straight back, and are all freed once a usable block (or failure,
 * result 0) ends the loop.
 */
unsigned long __get_dma_pages(int priority, unsigned long order)
{
        unsigned long list = 0;
        unsigned long result;
        unsigned long limit = MAX_DMA_ADDRESS;

        /* if (EISA_bus) limit = ~0UL; */
        if (priority != GFP_ATOMIC)
                priority = GFP_BUFFER;
        for (;;) {
                result = __get_free_pages(priority, order);
                if (result < limit) /* covers failure as well */
                        break;
                /* park the too-high block on the reject list */
                *(unsigned long *) result = list;
                list = result;
        }
        while (list) {
                unsigned long tmp = list;
                list = *(unsigned long *) list;
                free_pages(tmp, order);
        }
        return result;
}
 793 
 794 /*
 795  * Show free area list (used inside shift_scroll-lock stuff)
 796  * We also calculate the percentage fragmentation. We do this by counting the
 797  * memory on each free list with the exception of the first item on the list.
 798  */
 799 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 800 {
 801         unsigned long order, flags;
 802         unsigned long total = 0;
 803 
 804         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 805         save_flags(flags);
 806         cli();
 807         for (order=0 ; order < NR_MEM_LISTS; order++) {
 808                 struct mem_list * tmp;
 809                 unsigned long nr = 0;
 810                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 811                         nr ++;
 812                 }
 813                 total += nr * ((PAGE_SIZE>>10) << order);
 814                 printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
 815         }
 816         restore_flags(flags);
 817         printk("= %lukB)\n", total);
 818 #ifdef SWAP_CACHE_INFO
 819         show_swap_cache_info();
 820 #endif  
 821 }
 822 
 823 /*
 824  * Trying to stop swapping from a file is fraught with races, so
 825  * we repeat quite a bit here when we have to pause. swapoff()
 826  * isn't exactly timing-critical, so who cares (but this is /really/
 827  * inefficient, ugh).
 828  *
 829  * We return 1 after having slept, which makes the process start over
 830  * from the beginning for this process..
 831  */
 832 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 833         pte_t *dir, unsigned int type, unsigned long page)
 834 {
 835         pte_t pte = *dir;
 836 
 837         if (pte_none(pte))
 838                 return 0;
 839         if (pte_present(pte)) {
 840                 unsigned long page = pte_page(pte);
 841                 if (page >= high_memory)
 842                         return 0;
 843                 if (!in_swap_cache(page))
 844                         return 0;
 845                 if (SWP_TYPE(in_swap_cache(page)) != type)
 846                         return 0;
 847                 delete_from_swap_cache(page);
 848                 *dir = pte_mkdirty(pte);
 849                 return 0;
 850         }
 851         if (SWP_TYPE(pte_val(pte)) != type)
 852                 return 0;
 853         read_swap_page(pte_val(pte), (char *) page);
 854         if (pte_val(*dir) != pte_val(pte)) {
 855                 free_page(page);
 856                 return 1;
 857         }
 858         *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 859         ++vma->vm_task->mm->rss;
 860         swap_free(pte_val(pte));
 861         return 1;
 862 }
 863 
/*
 * Walk the pte range under one pmd entry, unswapping entries of the
 * given type via unuse_pte().  'offset' accumulates the virtual
 * distance from the vma base so unuse_pte() gets the page's address
 * relative to vma->vm_start.  Returns 1 as soon as any pte needed a
 * swap-in (caller restarts), 0 otherwise.
 */
static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
/* [previous][next][first][last][top][bottom][index][help] */
        unsigned long address, unsigned long size, unsigned long offset,
        unsigned int type, unsigned long page)
{
        pte_t * pte;
        unsigned long end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                /* Corrupt entry: report it and clear so we do not walk
                 * through garbage. */
                printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }
        pte = pte_offset(dir, address);
        /* Split 'address' into the pmd-aligned base (folded into
         * offset) and the in-pmd remainder we iterate over. */
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;         /* clamp to this pmd's span */
        do {
                if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
                        return 1;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}
 892 
/*
 * Walk the pmd range under one pgd entry, delegating each pmd span to
 * unuse_pmd().  Returns 1 as soon as a swap-in happened (caller
 * restarts the scan), 0 when the whole range was handled in place.
 */
static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
/* [previous][next][first][last][top][bottom][index][help] */
        unsigned long address, unsigned long size,
        unsigned int type, unsigned long page)
{
        pmd_t * pmd;
        unsigned long offset, end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                /* Corrupt entry: report and clear rather than walking
                 * through garbage. */
                printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }
        pmd = pmd_offset(dir, address);
        /* pgd-aligned base is carried separately so lower levels can
         * reconstruct the full virtual address. */
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;       /* clamp to this pgd's span */
        do {
                if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
                        return 1;
                /* advance to the next pmd boundary */
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
 921 
 922 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 923         unsigned long start, unsigned long end,
 924         unsigned int type, unsigned long page)
 925 {
 926         while (start < end) {
 927                 if (unuse_pgd(vma, pgdir, start, end - start, type, page))
 928                         return 1;
 929                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 930                 pgdir++;
 931         }
 932         return 0;
 933 }
 934 
 935 static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
     /* [previous][next][first][last][top][bottom][index][help] */
 936 {
 937         struct vm_area_struct* vma;
 938 
 939         /*
 940          * Go through process' page directory.
 941          */
 942         vma = p->mm->mmap;
 943         while (vma) {
 944                 pgd_t * pgd = pgd_offset(p, vma->vm_start);
 945                 if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
 946                         return 1;
 947                 vma = vma->vm_next;
 948         }
 949         return 0;
 950 }
 951 
 952 /*
 953  * To avoid races, we repeat for each process after having
 954  * swapped something in. That gets rid of a few pesky races,
 955  * and "swapoff" isn't exactly timing critical.
 956  */
 957 static int try_to_unuse(unsigned int type)
     /* [previous][next][first][last][top][bottom][index][help] */
 958 {
 959         int nr;
 960         unsigned long page = get_free_page(GFP_KERNEL);
 961 
 962         if (!page)
 963                 return -ENOMEM;
 964         nr = 0;
 965         while (nr < NR_TASKS) {
 966                 if (task[nr]) {
 967                         if (unuse_process(task[nr], type, page)) {
 968                                 page = get_free_page(GFP_KERNEL);
 969                                 if (!page)
 970                                         return -ENOMEM;
 971                                 continue;
 972                         }
 973                 }
 974                 nr++;
 975         }
 976         free_page(page);
 977         return 0;
 978 }
 979 
/*
 * The swapoff system call: stop swapping to the named device or file,
 * pulling every in-use page back into memory first via try_to_unuse().
 */
asmlinkage int sys_swapoff(const char * specialfile)
/* [previous][next][first][last][top][bottom][index][help] */
{
        struct swap_info_struct * p;
        struct inode * inode;
        unsigned int type;
        struct file filp;
        int i;

        if (!suser())
                return -EPERM;
        i = namei(specialfile,&inode);
        if (i)
                return i;
        /* Find the active swap area matching this inode (regular file)
         * or this block device. */
        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++) {
                if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                if (p->swap_file) {
                        if (p->swap_file == inode)
                                break;
                } else {
                        if (!S_ISBLK(inode->i_mode))
                                continue;
                        if (p->swap_device == inode->i_rdev)
                                break;
                }
        }

        if (type >= nr_swapfiles){
                iput(inode);
                return -EINVAL;
        }
        /* Drop WRITEOK so no new pages go out while we unuse; keep
         * SWP_USED so the slot is not reallocated. */
        p->flags = SWP_USED;
        i = try_to_unuse(type);
        if (i) {
                /* Could not empty it; re-enable the area and bail. */
                iput(inode);
                p->flags = SWP_WRITEOK;
                return i;
        }

        if(p->swap_device){
                memset(&filp, 0, sizeof(filp));         
                filp.f_inode = inode;
                filp.f_mode = 3; /* read write */
                /* open it again to get fops */
                if( !blkdev_open(inode, &filp) &&
                   filp.f_op && filp.f_op->release){
                        /* Released twice on purpose: once for the open
                         * we just did, once to balance the open done
                         * at swapon time - TODO confirm against
                         * sys_swapon's error path. */
                        filp.f_op->release(inode,&filp);
                        filp.f_op->release(inode,&filp);
                }
        }
        iput(inode);

        /* Tear down the bookkeeping and free the slot. */
        nr_swap_pages -= p->pages;
        iput(p->swap_file);
        p->swap_file = NULL;
        p->swap_device = 0;
        vfree(p->swap_map);
        p->swap_map = NULL;
        free_page((long) p->swap_lockmap);
        p->swap_lockmap = NULL;
        p->flags = 0;
        return 0;
}
1044 
1045 /*
1046  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1047  *
1048  * The swapon system call
1049  */
1050 asmlinkage int sys_swapon(const char * specialfile)
     /* [previous][next][first][last][top][bottom][index][help] */
1051 {
1052         struct swap_info_struct * p;
1053         struct inode * swap_inode;
1054         unsigned int type;
1055         int i,j;
1056         int error;
1057         struct file filp;
1058 
1059         memset(&filp, 0, sizeof(filp));
1060         if (!suser())
1061                 return -EPERM;
1062         p = swap_info;
1063         for (type = 0 ; type < nr_swapfiles ; type++,p++)
1064                 if (!(p->flags & SWP_USED))
1065                         break;
1066         if (type >= MAX_SWAPFILES)
1067                 return -EPERM;
1068         if (type >= nr_swapfiles)
1069                 nr_swapfiles = type+1;
1070         p->flags = SWP_USED;
1071         p->swap_file = NULL;
1072         p->swap_device = 0;
1073         p->swap_map = NULL;
1074         p->swap_lockmap = NULL;
1075         p->lowest_bit = 0;
1076         p->highest_bit = 0;
1077         p->max = 1;
1078         error = namei(specialfile,&swap_inode);
1079         if (error)
1080                 goto bad_swap_2;
1081         p->swap_file = swap_inode;
1082         error = -EBUSY;
1083         if (swap_inode->i_count != 1)
1084                 goto bad_swap_2;
1085         error = -EINVAL;
1086 
1087         if (S_ISBLK(swap_inode->i_mode)) {
1088                 p->swap_device = swap_inode->i_rdev;
1089 
1090                 filp.f_inode = swap_inode;
1091                 filp.f_mode = 3; /* read write */
1092                 error = blkdev_open(swap_inode, &filp);
1093                 p->swap_file = NULL;
1094                 iput(swap_inode);
1095                 if(error)
1096                         goto bad_swap_2;
1097                 error = -ENODEV;
1098                 if (!p->swap_device)
1099                         goto bad_swap;
1100                 error = -EBUSY;
1101                 for (i = 0 ; i < nr_swapfiles ; i++) {
1102                         if (i == type)
1103                                 continue;
1104                         if (p->swap_device == swap_info[i].swap_device)
1105                                 goto bad_swap;
1106                 }
1107         } else if (!S_ISREG(swap_inode->i_mode))
1108                 goto bad_swap;
1109         p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
1110         if (!p->swap_lockmap) {
1111                 printk("Unable to start swapping: out of memory :-)\n");
1112                 error = -ENOMEM;
1113                 goto bad_swap;
1114         }
1115         read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
1116         if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) {
1117                 printk("Unable to find swap-space signature\n");
1118                 error = -EINVAL;
1119                 goto bad_swap;
1120         }
1121         memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
1122         j = 0;
1123         p->lowest_bit = 0;
1124         p->highest_bit = 0;
1125         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
1126                 if (test_bit(i,p->swap_lockmap)) {
1127                         if (!p->lowest_bit)
1128                                 p->lowest_bit = i;
1129                         p->highest_bit = i;
1130                         p->max = i+1;
1131                         j++;
1132                 }
1133         }
1134         if (!j) {
1135                 printk("Empty swap-file\n");
1136                 error = -EINVAL;
1137                 goto bad_swap;
1138         }
1139         p->swap_map = (unsigned char *) vmalloc(p->max);
1140         if (!p->swap_map) {
1141                 error = -ENOMEM;
1142                 goto bad_swap;
1143         }
1144         for (i = 1 ; i < p->max ; i++) {
1145                 if (test_bit(i,p->swap_lockmap))
1146                         p->swap_map[i] = 0;
1147                 else
1148                         p->swap_map[i] = 0x80;
1149         }
1150         p->swap_map[0] = 0x80;
1151         memset(p->swap_lockmap,0,PAGE_SIZE);
1152         p->flags = SWP_WRITEOK;
1153         p->pages = j;
1154         nr_swap_pages += j;
1155         printk("Adding Swap: %dk swap-space\n",j<<2);
1156         return 0;
1157 bad_swap:
1158         if(filp.f_op && filp.f_op->release)
1159                 filp.f_op->release(filp.f_inode,&filp);
1160 bad_swap_2:
1161         free_page((long) p->swap_lockmap);
1162         vfree(p->swap_map);
1163         iput(p->swap_file);
1164         p->swap_device = 0;
1165         p->swap_file = NULL;
1166         p->swap_map = NULL;
1167         p->swap_lockmap = NULL;
1168         p->flags = 0;
1169         return error;
1170 }
1171 
1172 void si_swapinfo(struct sysinfo *val)
     /* [previous][next][first][last][top][bottom][index][help] */
1173 {
1174         unsigned int i, j;
1175 
1176         val->freeswap = val->totalswap = 0;
1177         for (i = 0; i < nr_swapfiles; i++) {
1178                 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1179                         continue;
1180                 for (j = 0; j < swap_info[i].max; ++j)
1181                         switch (swap_info[i].swap_map[j]) {
1182                                 case 128:
1183                                         continue;
1184                                 case 0:
1185                                         ++val->freeswap;
1186                                 default:
1187                                         ++val->totalswap;
1188                         }
1189         }
1190         val->freeswap <<= PAGE_SHIFT;
1191         val->totalswap <<= PAGE_SHIFT;
1192         return;
1193 }
1194 
1195 /*
1196  * set up the free-area data structures:
1197  *   - mark all pages MAP_PAGE_RESERVED
1198  *   - mark all memory queues empty
1199  *   - clear the memory bitmaps
1200  */
/*
 * set up the free-area data structures:
 *   - mark all pages MAP_PAGE_RESERVED
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * Carves mem_map and the per-order buddy bitmaps out of the memory
 * starting at start_mem; returns the first address past what was used.
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
/* [previous][next][first][last][top][bottom][index][help] */
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        int i;

        /*
         * select nr of pages we try to keep free for important stuff
         * with a minimum of 16 pages. This is totally arbitrary
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
        if (i < 16)
                i = 16;
        min_free_pages = i;
        start_mem = init_swap_cache(start_mem, end_mem);
        /* mem_map: one entry per physical page, all reserved for now;
         * free_pages_ok() will release the usable ones later. */
        mem_map = (mem_map_t *) start_mem;
        p = mem_map + MAP_NR(end_mem);
        start_mem = (unsigned long) p;
        while (p > mem_map)
                *--p = MAP_PAGE_RESERVED;

        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                /* empty circular free list for this order */
                free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
                /* mask doubles each order: round end_mem up to the
                 * block size of this order before sizing the bitmap. */
                mask += mask;
                end_mem = (end_mem + ~mask) & mask;
                /* one bit per buddy pair at this order, rounded up to
                 * whole bytes and then to word alignment */
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
                free_area_map[i] = (unsigned char *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */