root/mm/swap.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions:
  1. show_swap_cache_info
  2. add_to_swap_cache
  3. init_swap_cache
  4. rw_swap_page
  5. get_swap_page
  6. swap_duplicate
  7. swap_free
  8. swap_in
  9. try_to_swap_out
  10. swap_out_pmd
  11. swap_out_pgd
  12. swap_out_vma
  13. swap_out_process
  14. swap_out
  15. try_to_free_page
  16. add_mem_queue
  17. remove_mem_queue
  18. free_pages_ok
  19. check_free_buffers
  20. free_pages
  21. mark_used
  22. __get_free_pages
  23. __get_dma_pages
  24. show_free_areas
  25. unuse_pte
  26. unuse_pmd
  27. unuse_pgd
  28. unuse_vma
  29. unuse_process
  30. try_to_unuse
  31. sys_swapoff
  32. sys_swapon
  33. si_swapinfo
  34. free_area_init

   1 #define THREE_LEVEL
   2 /*
   3  *  linux/mm/swap.c
   4  *
   5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6  */
   7 
   8 /*
   9  * This file should contain most things doing the swapping from/to disk.
  10  * Started 18.12.91
  11  */
  12 
  13 #include <linux/mm.h>
  14 #include <linux/sched.h>
  15 #include <linux/head.h>
  16 #include <linux/kernel.h>
  17 #include <linux/kernel_stat.h>
  18 #include <linux/errno.h>
  19 #include <linux/string.h>
  20 #include <linux/stat.h>
  21 #include <linux/fs.h>
  22 
  23 #include <asm/dma.h>
  24 #include <asm/system.h> /* for cli()/sti() */
  25 #include <asm/bitops.h>
  26 #include <asm/pgtable.h>
  27 
  28 #define MAX_SWAPFILES 8
  29 
  30 #define SWP_USED        1
  31 #define SWP_WRITEOK     3
  32 
  33 #define SWP_TYPE(entry) (((entry) >> 1) & 0x7f)
  34 #define SWP_OFFSET(entry) ((entry) >> 12)
  35 #define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << 12))
  36 
  37 int min_free_pages = 20;
  38 
  39 static int nr_swapfiles = 0;
  40 static struct wait_queue * lock_queue = NULL;
  41 
  42 static struct swap_info_struct {
  43         unsigned long flags;
  44         struct inode * swap_file;
  45         unsigned int swap_device;
  46         unsigned char * swap_map;
  47         unsigned char * swap_lockmap;
  48         int pages;
  49         int lowest_bit;
  50         int highest_bit;
  51         unsigned long max;
  52 } swap_info[MAX_SWAPFILES];
  53 
  54 extern int shm_swap (int);
  55 
  56 unsigned long *swap_cache;
  57 
  58 #ifdef SWAP_CACHE_INFO
  59 unsigned long swap_cache_add_total = 0;
  60 unsigned long swap_cache_add_success = 0;
  61 unsigned long swap_cache_del_total = 0;
  62 unsigned long swap_cache_del_success = 0;
  63 unsigned long swap_cache_find_total = 0;
  64 unsigned long swap_cache_find_success = 0;
  65 
  66 extern inline void show_swap_cache_info(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  67 {
  68         printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
  69                 swap_cache_add_total, swap_cache_add_success, 
  70                 swap_cache_del_total, swap_cache_del_success,
  71                 swap_cache_find_total, swap_cache_find_success);
  72 }
  73 #endif
  74 
  75 static int add_to_swap_cache(unsigned long addr, unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
  76 {
  77         struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
  78 
  79 #ifdef SWAP_CACHE_INFO
  80         swap_cache_add_total++;
  81 #endif
  82         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  83                 entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
  84                 if (entry)  {
  85                         printk("swap_cache: replacing non-NULL entry\n");
  86                 }
  87 #ifdef SWAP_CACHE_INFO
  88                 swap_cache_add_success++;
  89 #endif
  90                 return 1;
  91         }
  92         return 0;
  93 }
  94 
  95 static unsigned long init_swap_cache(unsigned long mem_start,
     /* [previous][next][first][last][top][bottom][index][help] */
  96         unsigned long mem_end)
  97 {
  98         unsigned long swap_cache_size;
  99 
 100         mem_start = (mem_start + 15) & ~15;
 101         swap_cache = (unsigned long *) mem_start;
 102         swap_cache_size = MAP_NR(mem_end);
 103         memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
 104         return (unsigned long) (swap_cache + swap_cache_size);
 105 }
 106 
 107 void rw_swap_page(int rw, unsigned long entry, char * buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 108 {
 109         unsigned long type, offset;
 110         struct swap_info_struct * p;
 111 
 112         type = SWP_TYPE(entry);
 113         if (type >= nr_swapfiles) {
 114                 printk("Internal error: bad swap-device\n");
 115                 return;
 116         }
 117         p = &swap_info[type];
 118         offset = SWP_OFFSET(entry);
 119         if (offset >= p->max) {
 120                 printk("rw_swap_page: weirdness\n");
 121                 return;
 122         }
 123         if (p->swap_map && !p->swap_map[offset]) {
 124                 printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
 125                 return;
 126         }
 127         if (!(p->flags & SWP_USED)) {
 128                 printk("Trying to swap to unused swap-device\n");
 129                 return;
 130         }
 131         while (set_bit(offset,p->swap_lockmap))
 132                 sleep_on(&lock_queue);
 133         if (rw == READ)
 134                 kstat.pswpin++;
 135         else
 136                 kstat.pswpout++;
 137         if (p->swap_device) {
 138                 ll_rw_page(rw,p->swap_device,offset,buf);
 139         } else if (p->swap_file) {
 140                 struct inode *swapf = p->swap_file;
 141                 unsigned int zones[8];
 142                 int i;
 143                 if (swapf->i_op->bmap == NULL
 144                         && swapf->i_op->smap != NULL){
 145                         /*
 146                                 With MsDOS, we use msdos_smap which return
 147                                 a sector number (not a cluster or block number).
 148                                 It is a patch to enable the UMSDOS project.
 149                                 Other people are working on better solution.
 150 
 151                                 It sounds like ll_rw_swap_file defined
 152                                 it operation size (sector size) based on
 153                                 PAGE_SIZE and the number of block to read.
 154                                 So using bmap or smap should work even if
 155                                 smap will require more blocks.
 156                         */
 157                         int j;
 158                         unsigned int block = offset << 3;
 159 
 160                         for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
 161                                 if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
 162                                         printk("rw_swap_page: bad swap file\n");
 163                                         return;
 164                                 }
 165                         }
 166                 }else{
 167                         int j;
 168                         unsigned int block = offset
 169                                 << (12 - swapf->i_sb->s_blocksize_bits);
 170 
 171                         for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
 172                                 if (!(zones[i] = bmap(swapf,block++))) {
 173                                         printk("rw_swap_page: bad swap file\n");
 174                                         return;
 175                                 }
 176                 }
 177                 ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
 178         } else
 179                 printk("re_swap_page: no swap file or device\n");
 180         if (offset && !clear_bit(offset,p->swap_lockmap))
 181                 printk("rw_swap_page: lock already cleared\n");
 182         wake_up(&lock_queue);
 183 }
 184 
 185 unsigned int get_swap_page(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 186 {
 187         struct swap_info_struct * p;
 188         unsigned int offset, type;
 189 
 190         p = swap_info;
 191         for (type = 0 ; type < nr_swapfiles ; type++,p++) {
 192                 if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
 193                         continue;
 194                 for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
 195                         if (p->swap_map[offset])
 196                                 continue;
 197                         p->swap_map[offset] = 1;
 198                         nr_swap_pages--;
 199                         if (offset == p->highest_bit)
 200                                 p->highest_bit--;
 201                         p->lowest_bit = offset;
 202                         return SWP_ENTRY(type,offset);
 203                 }
 204         }
 205         return 0;
 206 }
 207 
 208 void swap_duplicate(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 209 {
 210         struct swap_info_struct * p;
 211         unsigned long offset, type;
 212 
 213         if (!entry)
 214                 return;
 215         offset = SWP_OFFSET(entry);
 216         type = SWP_TYPE(entry);
 217         if (type == SHM_SWP_TYPE)
 218                 return;
 219         if (type >= nr_swapfiles) {
 220                 printk("Trying to duplicate nonexistent swap-page\n");
 221                 return;
 222         }
 223         p = type + swap_info;
 224         if (offset >= p->max) {
 225                 printk("swap_duplicate: weirdness\n");
 226                 return;
 227         }
 228         if (!p->swap_map[offset]) {
 229                 printk("swap_duplicate: trying to duplicate unused page\n");
 230                 return;
 231         }
 232         p->swap_map[offset]++;
 233         return;
 234 }
 235 
 236 void swap_free(unsigned long entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 237 {
 238         struct swap_info_struct * p;
 239         unsigned long offset, type;
 240 
 241         if (!entry)
 242                 return;
 243         type = SWP_TYPE(entry);
 244         if (type == SHM_SWP_TYPE)
 245                 return;
 246         if (type >= nr_swapfiles) {
 247                 printk("Trying to free nonexistent swap-page\n");
 248                 return;
 249         }
 250         p = & swap_info[type];
 251         offset = SWP_OFFSET(entry);
 252         if (offset >= p->max) {
 253                 printk("swap_free: weirdness\n");
 254                 return;
 255         }
 256         if (!(p->flags & SWP_USED)) {
 257                 printk("Trying to free swap from unused swap-device\n");
 258                 return;
 259         }
 260         while (set_bit(offset,p->swap_lockmap))
 261                 sleep_on(&lock_queue);
 262         if (offset < p->lowest_bit)
 263                 p->lowest_bit = offset;
 264         if (offset > p->highest_bit)
 265                 p->highest_bit = offset;
 266         if (!p->swap_map[offset])
 267                 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
 268         else
 269                 if (!--p->swap_map[offset])
 270                         nr_swap_pages++;
 271         if (!clear_bit(offset,p->swap_lockmap))
 272                 printk("swap_free: lock already cleared\n");
 273         wake_up(&lock_queue);
 274 }
 275 
 276 /*
 277  * The tests may look silly, but it essentially makes sure that
 278  * no other process did a swap-in on us just as we were waiting.
 279  *
 280  * Also, don't bother to add to the swap cache if this page-in
 281  * was due to a write access.
 282  */
 283 void swap_in(struct vm_area_struct * vma, pte_t * page_table,
     /* [previous][next][first][last][top][bottom][index][help] */
 284         unsigned long entry, int write_access)
 285 {
 286         unsigned long page = get_free_page(GFP_KERNEL);
 287 
 288         if (pte_val(*page_table) != entry) {
 289                 free_page(page);
 290                 return;
 291         }
 292         if (!page) {
 293                 *page_table = BAD_PAGE;
 294                 swap_free(entry);
 295                 oom(current);
 296                 return;
 297         }
 298         read_swap_page(entry, (char *) page);
 299         if (pte_val(*page_table) != entry) {
 300                 free_page(page);
 301                 return;
 302         }
 303         vma->vm_task->mm->rss++;
 304         vma->vm_task->mm->maj_flt++;
 305         if (!write_access && add_to_swap_cache(page, entry)) {
 306                 *page_table = mk_pte(page, vma->vm_page_prot);
 307                 return;
 308         }
 309         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 310         swap_free(entry);
 311         return;
 312 }
 313 
 314 static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned offset, pte_t * page_table)
     /* [previous][next][first][last][top][bottom][index][help] */
 315 {
 316         pte_t pte;
 317         unsigned long entry;
 318         unsigned long page;
 319 
 320         pte = *page_table;
 321         if (!pte_present(pte))
 322                 return 0;
 323         page = pte_page(pte);
 324         if (page >= high_memory)
 325                 return 0;
 326         if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
 327                 return 0;
 328         if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte))  {
 329                 *page_table = pte_mkold(pte);
 330                 return 0;
 331         }       
 332         if (pte_dirty(pte)) {
 333                 if (mem_map[MAP_NR(page)] != 1)
 334                         return 0;
 335                 if (vma->vm_ops && vma->vm_ops->swapout)
 336                         vma->vm_ops->swapout(vma, offset, page_table);
 337                 else {
 338                         if (!(entry = get_swap_page()))
 339                                 return 0;
 340                         pte_val(*page_table) = entry;
 341                         invalidate();
 342                         write_swap_page(entry, (char *) page);
 343                 }
 344                 free_page(page);
 345                 return 1 + mem_map[MAP_NR(page)];
 346         }
 347         if ((entry = find_in_swap_cache(page)))  {
 348                 if (mem_map[MAP_NR(page)] != 1) {
 349                         *page_table = pte_mkdirty(pte);
 350                         printk("Aiee.. duplicated cached swap-cache entry\n");
 351                         return 0;
 352                 }
 353                 pte_val(*page_table) = entry;
 354                 invalidate();
 355                 free_page(page);
 356                 return 1;
 357         } 
 358         pte_clear(page_table);
 359         invalidate();
 360         free_page(page);
 361         return 1 + mem_map[MAP_NR(page)];
 362 }
 363 
 364 /*
 365  * A new implementation of swap_out().  We do not swap complete processes,
 366  * but only a small number of blocks, before we continue with the next
 367  * process.  The number of blocks actually swapped is determined on the
 368  * number of page faults, that this process actually had in the last time,
 369  * so we won't swap heavily used processes all the time ...
 370  *
 371  * Note: the priority argument is a hint on much CPU to waste with the
 372  *       swap block search, not a hint, of how much blocks to swap with
 373  *       each process.
 374  *
 375  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 376  */
 377 
 378 /*
 379  * These are the minimum and maximum number of pages to swap from one process,
 380  * before proceeding to the next:
 381  */
 382 #define SWAP_MIN        4
 383 #define SWAP_MAX        32
 384 
 385 /*
 386  * The actual number of pages to swap is determined as:
 387  * SWAP_RATIO / (number of recent major page faults)
 388  */
 389 #define SWAP_RATIO      128
 390 
 391 static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 392         unsigned long address, unsigned long end)
 393 {
 394         pte_t * pte;
 395         unsigned long pmd_end;
 396 
 397         if (pmd_none(*dir))
 398                 return 0;
 399         if (pmd_bad(*dir)) {
 400                 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 401                 pmd_clear(dir);
 402                 return 0;
 403         }
 404         
 405         pte = pte_offset(dir, address);
 406         
 407         pmd_end = (address + PMD_SIZE) & PMD_MASK;
 408         if (end > pmd_end)
 409                 end = pmd_end;
 410 
 411         do {
 412                 switch (try_to_swap_out(vma, address-vma->vm_start, pte)) {
 413                         case 0:
 414                                 break;
 415 
 416                         case 1:
 417                                 vma->vm_task->mm->rss--;
 418                                 /* continue with the following page the next time */
 419                                 vma->vm_task->mm->swap_address = address + PAGE_SIZE;
 420                                 return 1;
 421 
 422                         default:
 423                                 vma->vm_task->mm->rss--;
 424                                 break;
 425                 }
 426                 address += PAGE_SIZE;
 427                 pte++;
 428         } while (address < end);
 429         return 0;
 430 }
 431 
 432 static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
     /* [previous][next][first][last][top][bottom][index][help] */
 433         unsigned long address, unsigned long end)
 434 {
 435         pmd_t * pmd;
 436         unsigned long pgd_end;
 437 
 438         if (pgd_none(*dir))
 439                 return 0;
 440         if (pgd_bad(*dir)) {
 441                 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 442                 pgd_clear(dir);
 443                 return 0;
 444         }
 445 
 446         pmd = pmd_offset(dir, address);
 447 
 448         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 449         if (end > pgd_end)
 450                 end = pgd_end;
 451         
 452         do {
 453                 if (swap_out_pmd(vma, pmd, address, end))
 454                         return 1;
 455                 address = (address + PMD_SIZE) & PMD_MASK;
 456                 pmd++;
 457         } while (address < end);
 458         return 0;
 459 }
 460 
 461 static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 462         unsigned long start)
 463 {
 464         unsigned long end;
 465 
 466         end = vma->vm_end;
 467         while (start < end) {
 468                 if (swap_out_pgd(vma, pgdir, start, end))
 469                         return 1;
 470                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 471                 pgdir++;
 472         }
 473         return 0;
 474 }
 475 
 476 static int swap_out_process(struct task_struct * p)
     /* [previous][next][first][last][top][bottom][index][help] */
 477 {
 478         unsigned long address;
 479         struct vm_area_struct* vma;
 480 
 481         /*
 482          * Go through process' page directory.
 483          */
 484         address = p->mm->swap_address;
 485         p->mm->swap_address = 0;
 486 
 487         /*
 488          * Find the proper vm-area
 489          */
 490         vma = find_vma(p, address);
 491         if (!vma)
 492                 return 0;
 493         if (address < vma->vm_start)
 494                 address = vma->vm_start;
 495 
 496         for (;;) {
 497                 if (swap_out_vma(vma, pgd_offset(p, address), address))
 498                         return 1;
 499                 vma = vma->vm_next;
 500                 if (!vma)
 501                         return 0;
 502                 address = vma->vm_start;
 503         }
 504 }
 505 
 506 static int swap_out(unsigned int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 507 {
 508         static int swap_task;
 509         int loop, counter;
 510         struct task_struct *p;
 511 
 512         counter = 2*NR_TASKS >> priority;
 513         for(; counter >= 0; counter--, swap_task++) {
 514                 /*
 515                  * Check that swap_task is suitable for swapping.  If not, look for
 516                  * the next suitable process.
 517                  */
 518                 loop = 0;
 519                 while(1) {
 520                         if (swap_task >= NR_TASKS) {
 521                                 swap_task = 1;
 522                                 if (loop)
 523                                         /* all processes are unswappable or already swapped out */
 524                                         return 0;
 525                                 loop = 1;
 526                         }
 527 
 528                         p = task[swap_task];
 529                         if (p && p->mm->swappable && p->mm->rss)
 530                                 break;
 531 
 532                         swap_task++;
 533                 }
 534 
 535                 /*
 536                  * Determine the number of pages to swap from this process.
 537                  */
 538                 if (!p->mm->swap_cnt) {
 539                         p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
 540                         p->mm->old_maj_flt = p->mm->maj_flt;
 541 
 542                         if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
 543                                 p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
 544                                 p->mm->swap_cnt = SWAP_MIN;
 545                         } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
 546                                 p->mm->swap_cnt = SWAP_MAX;
 547                         else
 548                                 p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
 549                 }
 550                 if (swap_out_process(p)) {
 551                         if ((--p->mm->swap_cnt) == 0)
 552                                 swap_task++;
 553                         return 1;
 554                 }
 555         }
 556         return 0;
 557 }
 558 
 559 /*
 560  * we keep on shrinking one resource until it's considered "too hard",
 561  * and then switch to the next one (priority being an indication on how
 562  * hard we should try with the resource).
 563  *
 564  * This should automatically find the resource that can most easily be
 565  * free'd, so hopefully we'll get reasonable behaviour even under very
 566  * different circumstances.
 567  */
 568 static int try_to_free_page(int priority)
     /* [previous][next][first][last][top][bottom][index][help] */
 569 {
 570         static int state = 0;
 571         int i=6;
 572 
 573         switch (state) {
 574                 do {
 575                 case 0:
 576                         if (priority != GFP_NOBUFFER && shrink_buffers(i))
 577                                 return 1;
 578                         state = 1;
 579                 case 1:
 580                         if (shm_swap(i))
 581                                 return 1;
 582                         state = 2;
 583                 default:
 584                         if (swap_out(i))
 585                                 return 1;
 586                         state = 0;
 587                 } while(--i);
 588         }
 589         return 0;
 590 }
 591 
 592 static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 593 {
 594         entry->prev = head;
 595         (entry->next = head->next)->prev = entry;
 596         head->next = entry;
 597 }
 598 
 599 static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
     /* [previous][next][first][last][top][bottom][index][help] */
 600 {
 601         entry->next->prev = entry->prev;
 602         entry->prev->next = entry->next;
 603 }
 604 
 605 /*
 606  * Free_page() adds the page to the free lists. This is optimized for
 607  * fast normal cases (no error jumps taken normally).
 608  *
 609  * The way to optimize jumps for gcc-2.2.2 is to:
 610  *  - select the "normal" case and put it inside the if () { XXX }
 611  *  - no else-statements if you can avoid them
 612  *
 613  * With the above two rules, you get a straight-line execution path
 614  * for the normal case, giving better asm-code.
 615  *
 616  * free_page() may sleep since the page being freed may be a buffer
 617  * page or present in the swap cache. It will not sleep, however,
 618  * for a freshly allocated page (get_free_page()).
 619  */
 620 
 621 /*
 622  * Buddy system. Hairy. You really aren't expected to understand this
 623  */
 624 static inline void free_pages_ok(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 625 {
 626         unsigned long index = MAP_NR(addr) >> (1 + order);
 627         unsigned long mask = PAGE_MASK << order;
 628 
 629         addr &= mask;
 630         nr_free_pages += 1 << order;
 631         while (order < NR_MEM_LISTS-1) {
 632                 if (!change_bit(index, free_area_map[order]))
 633                         break;
 634                 remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
 635                 order++;
 636                 index >>= 1;
 637                 mask <<= 1;
 638                 addr &= mask;
 639         }
 640         add_mem_queue(free_area_list+order, (struct mem_list *) addr);
 641 }
 642 
 643 static inline void check_free_buffers(unsigned long addr)
     /* [previous][next][first][last][top][bottom][index][help] */
 644 {
 645         struct buffer_head * bh;
 646 
 647         bh = buffer_pages[MAP_NR(addr)];
 648         if (bh) {
 649                 struct buffer_head *tmp = bh;
 650                 do {
 651                         if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
 652                                 refile_buffer(tmp);
 653                         tmp = tmp->b_this_page;
 654                 } while (tmp != bh);
 655         }
 656 }
 657 
 658 void free_pages(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 659 {
 660         if (addr < high_memory) {
 661                 unsigned long flag;
 662                 mem_map_t * map = mem_map + MAP_NR(addr);
 663                 if (*map) {
 664                         if (!(*map & MAP_PAGE_RESERVED)) {
 665                                 save_flags(flag);
 666                                 cli();
 667                                 if (!--*map)  {
 668                                         free_pages_ok(addr, order);
 669                                         delete_from_swap_cache(addr);
 670                                 }
 671                                 restore_flags(flag);
 672                                 if (*map == 1)
 673                                         check_free_buffers(addr);
 674                         }
 675                         return;
 676                 }
 677                 printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
 678                 printk("PC = %p\n", __builtin_return_address(0));
 679                 return;
 680         }
 681 }
 682 
 683 /*
 684  * Some ugly macros to speed up __get_free_pages()..
 685  */
 686 #define RMQUEUE(order) \
 687 do { struct mem_list * queue = free_area_list+order; \
 688      unsigned long new_order = order; \
 689         do { struct mem_list *next = queue->next; \
 690                 if (queue != next) { \
 691                         (queue->next = next->next)->prev = queue; \
 692                         mark_used((unsigned long) next, new_order); \
 693                         nr_free_pages -= 1 << order; \
 694                         restore_flags(flags); \
 695                         EXPAND(next, order, new_order); \
 696                         return (unsigned long) next; \
 697                 } new_order++; queue++; \
 698         } while (new_order < NR_MEM_LISTS); \
 699 } while (0)
 700 
 701 static inline int mark_used(unsigned long addr, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 702 {
 703         return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
 704 }
 705 
 706 #define EXPAND(addr,low,high) \
 707 do { unsigned long size = PAGE_SIZE << high; \
 708         while (high > low) { \
 709                 high--; size >>= 1; cli(); \
 710                 add_mem_queue(free_area_list+high, addr); \
 711                 mark_used((unsigned long) addr, high); \
 712                 restore_flags(flags); \
 713                 addr = (struct mem_list *) (size + (unsigned long) addr); \
 714         } mem_map[MAP_NR((unsigned long) addr)] = 1; \
 715 } while (0)
 716 
 717 unsigned long __get_free_pages(int priority, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 718 {
 719         unsigned long flags;
 720         int reserved_pages;
 721 
 722         if (intr_count && priority != GFP_ATOMIC) {
 723                 static int count = 0;
 724                 if (++count < 5) {
 725                         printk("gfp called nonatomically from interrupt %p\n",
 726                                 __builtin_return_address(0));
 727                         priority = GFP_ATOMIC;
 728                 }
 729         }
 730         reserved_pages = 5;
 731         if (priority != GFP_NFS)
 732                 reserved_pages = min_free_pages;
 733         save_flags(flags);
 734 repeat:
 735         cli();
 736         if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
 737                 RMQUEUE(order);
 738                 restore_flags(flags);
 739                 return 0;
 740         }
 741         restore_flags(flags);
 742         if (priority != GFP_BUFFER && try_to_free_page(priority))
 743                 goto repeat;
 744         return 0;
 745 }
 746 
 747 /*
 748  * Yes, I know this is ugly. Don't tell me.
 749  */
 750 unsigned long __get_dma_pages(int priority, unsigned long order)
     /* [previous][next][first][last][top][bottom][index][help] */
 751 {
 752         unsigned long list = 0;
 753         unsigned long result;
 754         unsigned long limit = MAX_DMA_ADDRESS;
 755 
 756         /* if (EISA_bus) limit = ~0UL; */
 757         if (priority != GFP_ATOMIC)
 758                 priority = GFP_BUFFER;
 759         for (;;) {
 760                 result = __get_free_pages(priority, order);
 761                 if (result < limit) /* covers failure as well */
 762                         break;
 763                 *(unsigned long *) result = list;
 764                 list = result;
 765         }
 766         while (list) {
 767                 unsigned long tmp = list;
 768                 list = *(unsigned long *) list;
 769                 free_pages(tmp, order);
 770         }
 771         return result;
 772 }
 773 
 774 /*
 775  * Show free area list (used inside shift_scroll-lock stuff)
 776  * We also calculate the percentage fragmentation. We do this by counting the
 777  * memory on each free list with the exception of the first item on the list.
 778  */
 779 void show_free_areas(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 780 {
 781         unsigned long order, flags;
 782         unsigned long total = 0;
 783 
 784         printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 785         save_flags(flags);
 786         cli();
 787         for (order=0 ; order < NR_MEM_LISTS; order++) {
 788                 struct mem_list * tmp;
 789                 unsigned long nr = 0;
 790                 for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
 791                         nr ++;
 792                 }
 793                 total += nr * (4 << order);
 794                 printk("%lu*%ukB ", nr, 4 << order);
 795         }
 796         restore_flags(flags);
 797         printk("= %lukB)\n", total);
 798 #ifdef SWAP_CACHE_INFO
 799         show_swap_cache_info();
 800 #endif  
 801 }
 802 
 803 /*
 804  * Trying to stop swapping from a file is fraught with races, so
 805  * we repeat quite a bit here when we have to pause. swapoff()
 806  * isn't exactly timing-critical, so who cares (but this is /really/
 807  * inefficient, ugh).
 808  *
 809  * We return 1 after having slept, which makes the process start over
 810  * from the beginning for this process..
 811  */
 812 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 813         pte_t *dir, unsigned int type, unsigned long page)
 814 {
 815         pte_t pte = *dir;
 816 
 817         if (pte_none(pte))
 818                 return 0;
 819         if (pte_present(pte)) {
 820                 unsigned long page = pte_page(pte);
 821                 if (page >= high_memory)
 822                         return 0;
 823                 if (!in_swap_cache(page))
 824                         return 0;
 825                 if (SWP_TYPE(in_swap_cache(page)) != type)
 826                         return 0;
 827                 delete_from_swap_cache(page);
 828                 *dir = pte_mkdirty(pte);
 829                 return 0;
 830         }
 831         if (SWP_TYPE(pte_val(pte)) != type)
 832                 return 0;
 833         read_swap_page(pte_val(pte), (char *) page);
 834         if (pte_val(*dir) != pte_val(pte)) {
 835                 free_page(page);
 836                 return 1;
 837         }
 838         *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 839         ++vma->vm_task->mm->rss;
 840         swap_free(pte_val(pte));
 841         return 1;
 842 }
 843 
/*
 * Walk the ptes covered by one pmd entry, swapping back in every page
 * that lives in swap area "type".  "offset" accumulates the virtual-
 * address bits already consumed by the higher levels, so unuse_pte()
 * can be handed the page's offset within the vma.
 * Returns 1 if a lower level slept (caller must restart), 0 otherwise.
 */
static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
        unsigned long address, unsigned long size, unsigned long offset,
        unsigned int type, unsigned long page)
{
        pte_t * pte;
        unsigned long end;

        if (pmd_none(*dir))
                return 0;
        if (pmd_bad(*dir)) {
                /* Corrupt entry: report it and clear it so we don't trip again. */
                printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
                pmd_clear(dir);
                return 0;
        }
        pte = pte_offset(dir, address);
        /* Fold the pmd-level address bits into the cumulative offset,
         * then reduce "address" to its offset within this pmd's span. */
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
                        return 1;
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}
 872 
/*
 * Walk the pmd entries covered by one pgd entry, delegating each to
 * unuse_pmd().  Returns 1 if a lower level slept (caller restarts).
 */
static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
        unsigned long address, unsigned long size,
        unsigned int type, unsigned long page)
{
        pmd_t * pmd;
        unsigned long offset, end;

        if (pgd_none(*dir))
                return 0;
        if (pgd_bad(*dir)) {
                /* Corrupt entry: report it and clear it. */
                printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
                pgd_clear(dir);
                return 0;
        }
        pmd = pmd_offset(dir, address);
        /* Remember the pgd-level address bits; lower levels add theirs. */
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
                        return 1;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
 901 
 902 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
     /* [previous][next][first][last][top][bottom][index][help] */
 903         unsigned long start, unsigned long end,
 904         unsigned int type, unsigned long page)
 905 {
 906         while (start < end) {
 907                 if (unuse_pgd(vma, pgdir, start, end - start, type, page))
 908                         return 1;
 909                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 910                 pgdir++;
 911         }
 912         return 0;
 913 }
 914 
 915 static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
     /* [previous][next][first][last][top][bottom][index][help] */
 916 {
 917         struct vm_area_struct* vma;
 918 
 919         /*
 920          * Go through process' page directory.
 921          */
 922         vma = p->mm->mmap;
 923         while (vma) {
 924                 pgd_t * pgd = pgd_offset(p, vma->vm_start);
 925                 if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
 926                         return 1;
 927                 vma = vma->vm_next;
 928         }
 929         return 0;
 930 }
 931 
 932 /*
 933  * To avoid races, we repeat for each process after having
 934  * swapped something in. That gets rid of a few pesky races,
 935  * and "swapoff" isn't exactly timing critical.
 936  */
 937 static int try_to_unuse(unsigned int type)
     /* [previous][next][first][last][top][bottom][index][help] */
 938 {
 939         int nr;
 940         unsigned long page = get_free_page(GFP_KERNEL);
 941 
 942         if (!page)
 943                 return -ENOMEM;
 944         nr = 0;
 945         while (nr < NR_TASKS) {
 946                 if (task[nr]) {
 947                         if (unuse_process(task[nr], type, page)) {
 948                                 page = get_free_page(GFP_KERNEL);
 949                                 if (!page)
 950                                         return -ENOMEM;
 951                                 continue;
 952                         }
 953                 }
 954                 nr++;
 955         }
 956         free_page(page);
 957         return 0;
 958 }
 959 
/*
 * The swapoff system call: stop swapping to the given file or block
 * device.  Every page still out on that area is swapped back in via
 * try_to_unuse() before the area's bookkeeping is torn down.
 */
asmlinkage int sys_swapoff(const char * specialfile)
{
        struct swap_info_struct * p;
        struct inode * inode;
        unsigned int type;
        struct file filp;
        int i;

        if (!suser())
                return -EPERM;
        i = namei(specialfile,&inode);
        if (i)
                return i;
        /* Find the active swap area matching this file or device. */
        p = swap_info;
        for (type = 0 ; type < nr_swapfiles ; type++,p++) {
                if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                if (p->swap_file) {
                        if (p->swap_file == inode)
                                break;
                } else {
                        if (!S_ISBLK(inode->i_mode))
                                continue;
                        if (p->swap_device == inode->i_rdev)
                                break;
                }
        }

        if (type >= nr_swapfiles){
                iput(inode);
                return -EINVAL;
        }
        /* Block further allocation from this area, then pull every
         * page still out on it back into memory. */
        p->flags = SWP_USED;
        i = try_to_unuse(type);
        if (i) {
                /* Could not free everything: re-enable the area. */
                iput(inode);
                p->flags = SWP_WRITEOK;
                return i;
        }

        if(p->swap_device){
                memset(&filp, 0, sizeof(filp));
                filp.f_inode = inode;
                filp.f_mode = 3; /* read write */
                /* open it again to get fops */
                if( !blkdev_open(inode, &filp) &&
                   filp.f_op && filp.f_op->release){
                        filp.f_op->release(inode,&filp);
                        /* NOTE(review): release is called twice — presumably
                         * once to balance the blkdev_open above and once for
                         * the open done at swapon() time; confirm against
                         * blkdev_open/release semantics. */
                        filp.f_op->release(inode,&filp);
                }
        }
        iput(inode);

        /* Tear down the area's bookkeeping. */
        nr_swap_pages -= p->pages;
        iput(p->swap_file);
        p->swap_file = NULL;
        p->swap_device = 0;
        vfree(p->swap_map);
        p->swap_map = NULL;
        free_page((long) p->swap_lockmap);
        p->swap_lockmap = NULL;
        p->flags = 0;
        return 0;
}
1024 
1025 /*
1026  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1027  *
1028  * The swapon system call
1029  */
1030 asmlinkage int sys_swapon(const char * specialfile)
     /* [previous][next][first][last][top][bottom][index][help] */
1031 {
1032         struct swap_info_struct * p;
1033         struct inode * swap_inode;
1034         unsigned int type;
1035         int i,j;
1036         int error;
1037         struct file filp;
1038 
1039         memset(&filp, 0, sizeof(filp));
1040         if (!suser())
1041                 return -EPERM;
1042         p = swap_info;
1043         for (type = 0 ; type < nr_swapfiles ; type++,p++)
1044                 if (!(p->flags & SWP_USED))
1045                         break;
1046         if (type >= MAX_SWAPFILES)
1047                 return -EPERM;
1048         if (type >= nr_swapfiles)
1049                 nr_swapfiles = type+1;
1050         p->flags = SWP_USED;
1051         p->swap_file = NULL;
1052         p->swap_device = 0;
1053         p->swap_map = NULL;
1054         p->swap_lockmap = NULL;
1055         p->lowest_bit = 0;
1056         p->highest_bit = 0;
1057         p->max = 1;
1058         error = namei(specialfile,&swap_inode);
1059         if (error)
1060                 goto bad_swap_2;
1061         p->swap_file = swap_inode;
1062         error = -EBUSY;
1063         if (swap_inode->i_count != 1)
1064                 goto bad_swap_2;
1065         error = -EINVAL;
1066 
1067         if (S_ISBLK(swap_inode->i_mode)) {
1068                 p->swap_device = swap_inode->i_rdev;
1069 
1070                 filp.f_inode = swap_inode;
1071                 filp.f_mode = 3; /* read write */
1072                 error = blkdev_open(swap_inode, &filp);
1073                 p->swap_file = NULL;
1074                 iput(swap_inode);
1075                 if(error)
1076                         goto bad_swap_2;
1077                 error = -ENODEV;
1078                 if (!p->swap_device)
1079                         goto bad_swap;
1080                 error = -EBUSY;
1081                 for (i = 0 ; i < nr_swapfiles ; i++) {
1082                         if (i == type)
1083                                 continue;
1084                         if (p->swap_device == swap_info[i].swap_device)
1085                                 goto bad_swap;
1086                 }
1087         } else if (!S_ISREG(swap_inode->i_mode))
1088                 goto bad_swap;
1089         p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
1090         if (!p->swap_lockmap) {
1091                 printk("Unable to start swapping: out of memory :-)\n");
1092                 error = -ENOMEM;
1093                 goto bad_swap;
1094         }
1095         read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
1096         if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) {
1097                 printk("Unable to find swap-space signature\n");
1098                 error = -EINVAL;
1099                 goto bad_swap;
1100         }
1101         memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
1102         j = 0;
1103         p->lowest_bit = 0;
1104         p->highest_bit = 0;
1105         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
1106                 if (test_bit(i,p->swap_lockmap)) {
1107                         if (!p->lowest_bit)
1108                                 p->lowest_bit = i;
1109                         p->highest_bit = i;
1110                         p->max = i+1;
1111                         j++;
1112                 }
1113         }
1114         if (!j) {
1115                 printk("Empty swap-file\n");
1116                 error = -EINVAL;
1117                 goto bad_swap;
1118         }
1119         p->swap_map = (unsigned char *) vmalloc(p->max);
1120         if (!p->swap_map) {
1121                 error = -ENOMEM;
1122                 goto bad_swap;
1123         }
1124         for (i = 1 ; i < p->max ; i++) {
1125                 if (test_bit(i,p->swap_lockmap))
1126                         p->swap_map[i] = 0;
1127                 else
1128                         p->swap_map[i] = 0x80;
1129         }
1130         p->swap_map[0] = 0x80;
1131         memset(p->swap_lockmap,0,PAGE_SIZE);
1132         p->flags = SWP_WRITEOK;
1133         p->pages = j;
1134         nr_swap_pages += j;
1135         printk("Adding Swap: %dk swap-space\n",j<<2);
1136         return 0;
1137 bad_swap:
1138         if(filp.f_op && filp.f_op->release)
1139                 filp.f_op->release(filp.f_inode,&filp);
1140 bad_swap_2:
1141         free_page((long) p->swap_lockmap);
1142         vfree(p->swap_map);
1143         iput(p->swap_file);
1144         p->swap_device = 0;
1145         p->swap_file = NULL;
1146         p->swap_map = NULL;
1147         p->swap_lockmap = NULL;
1148         p->flags = 0;
1149         return error;
1150 }
1151 
/*
 * Fill in the swap fields of a sysinfo structure: total and free
 * swap, in bytes.  Bad pages (map value 128) are skipped entirely;
 * in-use pages count towards the total only.
 */
void si_swapinfo(struct sysinfo *val)
{
        unsigned int i, j;

        val->freeswap = val->totalswap = 0;
        for (i = 0; i < nr_swapfiles; i++) {
                if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
                        continue;
                for (j = 0; j < swap_info[i].max; ++j)
                        switch (swap_info[i].swap_map[j]) {
                                case 128:
                                        /* bad page: counted nowhere */
                                        continue;
                                case 0:
                                        ++val->freeswap;
                                        /* fall through: a free page is
                                           part of the total as well */
                                default:
                                        ++val->totalswap;
                        }
        }
        /* Convert page counts to bytes. */
        val->freeswap <<= PAGE_SHIFT;
        val->totalswap <<= PAGE_SHIFT;
        return;
}
1174 
/*
 * set up the free-area data structures:
 *   - mark all pages MAP_PAGE_RESERVED
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 * Returns the new start_mem after carving out the swap cache, the
 * mem_map array and the per-order free-area bitmaps.
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        int i;

        /*
         * select nr of pages we try to keep free for important stuff
         * with a minimum of 16 pages. This is totally arbitrary
         */
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
        if (i < 16)
                i = 16;
        min_free_pages = i;
        start_mem = init_swap_cache(start_mem, end_mem);
        /* mem_map follows the swap cache; every page starts out
         * reserved and is released explicitly later. */
        mem_map = (mem_map_t *) start_mem;
        p = mem_map + MAP_NR(end_mem);
        start_mem = (unsigned long) p;
        while (p > mem_map)
                *--p = MAP_PAGE_RESERVED;

        /* One empty free list plus one bitmap per order. */
        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
                mask += mask;   /* widen mask to blocks of order i+1 */
                end_mem = (end_mem + ~mask) & mask;     /* round up */
                /* One bit per block of this order, rounded up first to
                 * a whole byte, then to word alignment. */
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
                free_area_map[i] = (unsigned char *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}

/* [previous][next][first][last][top][bottom][index][help] */