root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. oom
  2. free_one_pmd
  3. free_one_pgd
  4. new_page_tables
  5. clear_page_tables
  6. free_page_tables
  7. copy_one_pte
  8. copy_pte_range
  9. copy_pmd_range
  10. copy_page_range
  11. forget_pte
  12. unmap_pte_range
  13. unmap_pmd_range
  14. zap_page_range
  15. unmap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. unshare
  30. vmtruncate
  31. get_empty_pgtable
  32. do_swap_page
  33. do_no_page
  34. handle_pte_fault
  35. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
   13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 
  47 #include <asm/system.h>
  48 #include <asm/segment.h>
  49 #include <asm/pgtable.h>
  50 
  51 unsigned long high_memory = 0;
  52 
  53 /*
  54  * The free_area_list arrays point to the queue heads of the free areas
  55  * of different sizes
  56  */
  57 int nr_swap_pages = 0;
  58 int nr_free_pages = 0;
  59 struct mem_list free_area_list[NR_MEM_LISTS];
  60 unsigned char * free_area_map[NR_MEM_LISTS];
  61 
  62 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  63 
  64 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  65 
  66 mem_map_t * mem_map = NULL;
  67 
  68 /*
  69  * oom() prints a message (so that the user knows why the process died),
  70  * and gives the process an untrappable SIGKILL.
  71  */
  72 void oom(struct task_struct * task)
  73 {
   74         printk("\nOut of memory for %s.\n", task->comm);
  75         task->sig->action[SIGKILL-1].sa_handler = NULL;
  76         task->blocked &= ~(1<<(SIGKILL-1));
  77         send_sig(SIGKILL,task,1);
  78 }
  79 
  80 /*
  81  * Note: this doesn't free the actual pages themselves. That
  82  * has been handled earlier when unmapping all the memory regions.
  83  */
  84 static inline void free_one_pmd(pmd_t * dir)
  85 {
  86         pte_t * pte;
  87 
  88         if (pmd_none(*dir))
  89                 return;
  90         if (pmd_bad(*dir)) {
  91                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  92                 pmd_clear(dir);
  93                 return;
  94         }
  95         pte = pte_offset(dir, 0);
  96         pmd_clear(dir);
  97         pte_free(pte);
  98 }
  99 
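      /*
       * Release the pmd page that a page-directory entry points to,
       * freeing each page table under it unless the pmd page itself is
       * still shared (pmd_inuse).
       */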
 100 static inline void free_one_pgd(pgd_t * dir)
 101 {
 102         pmd_t * pmd;
 103 
 104         if (pgd_none(*dir))
 105                 return;
 106         if (pgd_bad(*dir)) {
 107                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 108                 pgd_clear(dir);
 109                 return;
 110         }
 111         pmd = pmd_offset(dir, 0);
 112         pgd_clear(dir);
 113         if (!pmd_inuse(pmd)) {
 114                 int j;
 115                 for (j = 0; j < PTRS_PER_PMD ; j++)
 116                         free_one_pmd(pmd+j);
 117         }
 118         pmd_free(pmd);
 119 }
 120         
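      /*
       * Allocate a fresh page directory for 'tsk': the user entries start
       * out empty, the kernel entries are copied from the init_mm page
       * directory.
       */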
 121 int new_page_tables(struct task_struct * tsk)
 122 {
 123         pgd_t * page_dir, * new_pg;
 124         int i;
 125 
 126         if (!(new_pg = pgd_alloc()))
 127                 return -ENOMEM;
 128         page_dir = pgd_offset(&init_mm, 0);
 129         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 130                 new_pg[i] = page_dir[i];
 131         SET_PAGE_DIR(tsk, new_pg);
 132         tsk->mm->pgd = new_pg;
 133         return 0;
 134 }
 135 
 136 /*
 137  * This function clears all user-level page tables of a process - this
 138  * is needed by execve(), so that old pages aren't in the way. Note that
 139  * unlike 'free_page_tables()', this function still leaves a valid
 140  * page-table-tree in memory: it just removes the user pages. The two
 141  * functions are similar, but there is a fundamental difference.
 142  */
 143 void clear_page_tables(struct task_struct * tsk)
 144 {
 145         int i;
 146         pgd_t * page_dir;
 147 
 148         if (!tsk)
 149                 return;
 150         if (tsk == task[0])
 151                 panic("task[0] (swapper) doesn't support exec()\n");
 152         page_dir = pgd_offset(tsk->mm, 0);
 153         if (!page_dir) {
 154                 printk("%s trying to clear NULL page-directory: not good\n", tsk->comm);
 155                 return;
 156         }
 157         if (pgd_inuse(page_dir)) {
 158                 if (new_page_tables(tsk))
 159                         oom(tsk);
 160                 pgd_free(page_dir);
 161                 return;
 162         }
 163         if (page_dir == swapper_pg_dir) {
 164                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 165                 return;
 166         }
 167         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 168                 free_one_pgd(page_dir + i);
 169         invalidate();
 170         return;
 171 }
 172 
 173 /*
 174  * This function frees up all page tables of a process when it exits.
 175  */
 176 void free_page_tables(struct task_struct * tsk)
 177 {
 178         int i;
 179         pgd_t * page_dir;
 180 
 181         page_dir = tsk->mm->pgd;
 182         if (!page_dir || page_dir == swapper_pg_dir) {
 183                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 184                 return;
 185         }
 186         SET_PAGE_DIR(tsk, swapper_pg_dir);
 187         if (pgd_inuse(page_dir)) {
 188                 pgd_free(page_dir);
 189                 return;
 190         }
 191         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 192         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 193                 free_one_pgd(page_dir + i);
 194         pgd_free(page_dir);
 195         invalidate();
 196 }
 197 
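      /*
       * Copy a single pte from parent to child. Swap entries have their
       * swap count bumped, ordinary present pages have their reference
       * count bumped, and copy-on-write pages are write-protected in
       * both mappings.
       */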
 198 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 199 {
 200         pte_t pte = *old_pte;
 201 
 202         if (pte_none(pte))
 203                 return;
 204         if (!pte_present(pte)) {
 205                 swap_duplicate(pte_val(pte));
 206                 set_pte(new_pte, pte);
 207                 return;
 208         }
 209         if (pte_page(pte) > high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 210                 set_pte(new_pte, pte);
 211                 return;
 212         }
 213         if (pte_cow(pte))
 214                 pte = pte_wrprotect(pte);
 215         if (delete_from_swap_cache(pte_page(pte)))
 216                 pte = pte_mkdirty(pte);
 217         set_pte(new_pte, pte_mkold(pte));
 218         set_pte(old_pte, pte);
 219         mem_map[MAP_NR(pte_page(pte))].count++;
 220 }
 221 
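      /*
       * Copy the ptes covered by one pmd entry, allocating the destination
       * page table if it does not exist yet.
       */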
 222 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size)
 223 {
 224         pte_t * src_pte, * dst_pte;
 225         unsigned long end;
 226 
 227         if (pmd_none(*src_pmd))
 228                 return 0;
 229         if (pmd_bad(*src_pmd)) {
 230                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 231                 pmd_clear(src_pmd);
 232                 return 0;
 233         }
 234         src_pte = pte_offset(src_pmd, address);
 235         if (pmd_none(*dst_pmd)) {
 236                 if (!pte_alloc(dst_pmd, 0))
 237                         return -ENOMEM;
 238         }
 239         dst_pte = pte_offset(dst_pmd, address);
 240         address &= ~PMD_MASK;
 241         end = address + size;
 242         if (end >= PMD_SIZE)
 243                 end = PMD_SIZE;
 244         do {
 245                 /* I would like to switch arguments here, to make it
 246                  * consistent with copy_xxx_range and memcpy syntax.
 247                  */
 248                 copy_one_pte(src_pte++, dst_pte++);
 249                 address += PAGE_SIZE;
 250         } while (address < end);
 251         return 0;
 252 }
 253 
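      /*
       * Copy the pmd entries covered by one pgd entry, allocating the
       * destination pmd page if needed and copying each page table with
       * copy_pte_range().
       */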
 254 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size)
 255 {
 256         pmd_t * src_pmd, * dst_pmd;
 257         unsigned long end;
 258         int error = 0;
 259 
 260         if (pgd_none(*src_pgd))
 261                 return 0;
 262         if (pgd_bad(*src_pgd)) {
 263                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 264                 pgd_clear(src_pgd);
 265                 return 0;
 266         }
 267         src_pmd = pmd_offset(src_pgd, address);
 268         if (pgd_none(*dst_pgd)) {
 269                 if (!pmd_alloc(dst_pgd, 0))
 270                         return -ENOMEM;
 271         }
 272         dst_pmd = pmd_offset(dst_pgd, address);
 273         address &= ~PGDIR_MASK;
 274         end = address + size;
 275         if (end > PGDIR_SIZE)
 276                 end = PGDIR_SIZE;
 277         do {
 278                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address);
 279                 if (error)
 280                         break;
 281                 address = (address + PMD_SIZE) & PMD_MASK; 
 282         } while (address < end);
 283         return error;
 284 }
 285 
 286 /*
  287  * Copy one vm_area from one task to the other. Assumes that any page
  288  * tables already present in the new task have been cleared in the whole
  289  * range covered by this vma.
 290  */
 291 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 292                         struct vm_area_struct *vma)
 293 {
 294         pgd_t * src_pgd, * dst_pgd;
 295         unsigned long address = vma->vm_start;
 296         unsigned long end = vma->vm_end;
 297         int error = 0;
 298 
 299         src_pgd = pgd_offset(src, address);
 300         dst_pgd = pgd_offset(dst, address);
 301         while (address < end) {
 302                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address);
 303                 if (error)
 304                         break;
 305                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 306         }
 307         invalidate();
 308         return error;
 309 }
 310 
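      /*
       * Drop one reference to whatever a pte points at: a present page is
       * freed (and rss decremented for non-reserved pages), a swap entry
       * has its swap count decremented.
       */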
 311 static inline void forget_pte(pte_t page)
 312 {
 313         if (pte_none(page))
 314                 return;
 315         if (pte_present(page)) {
 316                 free_page(pte_page(page));
 317                 if (mem_map[MAP_NR(pte_page(page))].reserved)
 318                         return;
 319                 if (current->mm->rss <= 0)
 320                         return;
 321                 current->mm->rss--;
 322                 return;
 323         }
 324         swap_free(pte_val(page));
 325 }
 326 
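      /*
       * Clear the ptes covered by one pmd entry, handing each old value
       * to forget_pte().
       */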
 327 static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 328 {
 329         pte_t * pte;
 330         unsigned long end;
 331 
 332         if (pmd_none(*pmd))
 333                 return;
 334         if (pmd_bad(*pmd)) {
 335                 printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 336                 pmd_clear(pmd);
 337                 return;
 338         }
 339         pte = pte_offset(pmd, address);
 340         address &= ~PMD_MASK;
 341         end = address + size;
 342         if (end >= PMD_SIZE)
 343                 end = PMD_SIZE;
 344         do {
 345                 pte_t page = *pte;
 346                 pte_clear(pte);
 347                 forget_pte(page);
 348                 address += PAGE_SIZE;
 349                 pte++;
 350         } while (address < end);
 351 }
 352 
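      /*
       * Walk the pmd entries covered by one pgd entry and unmap the ptes
       * below each of them.
       */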
 353 static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 354 {
 355         pmd_t * pmd;
 356         unsigned long end;
 357 
 358         if (pgd_none(*dir))
 359                 return;
 360         if (pgd_bad(*dir)) {
 361                 printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 362                 pgd_clear(dir);
 363                 return;
 364         }
 365         pmd = pmd_offset(dir, address);
 366         address &= ~PGDIR_MASK;
 367         end = address + size;
 368         if (end > PGDIR_SIZE)
 369                 end = PGDIR_SIZE;
 370         do {
 371                 unmap_pte_range(pmd, address, end - address);
 372                 address = (address + PMD_SIZE) & PMD_MASK; 
 373                 pmd++;
 374         } while (address < end);
 375 }
 376 
 377 /*
 378  * remove user pages in a given range.
 379  */
 380 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 381 {
 382         pgd_t * dir;
 383         unsigned long end = address + size;
 384 
 385         dir = pgd_offset(mm, address);
 386         while (address < end) {
 387                 unmap_pmd_range(dir, address, end - address);
 388                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 389                 dir++;
 390         }
 391         invalidate();
 392         return 0;
 393 }
 394 
 395 /*
  396  * A more complete version of free_page_tables() which works at page
  397  * granularity.
 398  */
 399 int unmap_page_range(unsigned long address, unsigned long size)
 400 {
 401         return zap_page_range(current->mm, address, size);
 402 }
 403 
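      /*
       * The zeromap functions below map the global ZERO_PAGE, write-protected,
       * over a range of user addresses, so that reads see zeroes without
       * allocating any memory.
       */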
 404 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 405 {
 406         unsigned long end;
 407 
 408         address &= ~PMD_MASK;
 409         end = address + size;
 410         if (end > PMD_SIZE)
 411                 end = PMD_SIZE;
 412         do {
 413                 pte_t oldpage = *pte;
 414                 set_pte(pte, zero_pte);
 415                 forget_pte(oldpage);
 416                 address += PAGE_SIZE;
 417                 pte++;
 418         } while (address < end);
 419 }
 420 
 421 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 422 {
 423         unsigned long end;
 424 
 425         address &= ~PGDIR_MASK;
 426         end = address + size;
 427         if (end > PGDIR_SIZE)
 428                 end = PGDIR_SIZE;
 429         do {
 430                 pte_t * pte = pte_alloc(pmd, address);
 431                 if (!pte)
 432                         return -ENOMEM;
 433                 zeromap_pte_range(pte, address, end - address, zero_pte);
 434                 address = (address + PMD_SIZE) & PMD_MASK;
 435                 pmd++;
 436         } while (address < end);
 437         return 0;
 438 }
 439 
 440 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 441 {
 442         int error = 0;
 443         pgd_t * dir;
 444         unsigned long end = address + size;
 445         pte_t zero_pte;
 446 
 447         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 448         dir = pgd_offset(current->mm, address);
 449         while (address < end) {
 450                 pmd_t *pmd = pmd_alloc(dir, address);
 451                 error = -ENOMEM;
 452                 if (!pmd)
 453                         break;
 454                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 455                 if (error)
 456                         break;
 457                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 458                 dir++;
 459         }
 460         invalidate();
 461         return error;
 462 }
 463 
 464 /*
  465  * Maps a range of physical memory into the requested pages. The old
  466  * mappings are removed. Any references to nonexistent pages result
  467  * in null mappings (currently treated as "copy-on-access")
 468  */
 469 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 470         unsigned long offset, pgprot_t prot)
 471 {
 472         unsigned long end;
 473 
 474         address &= ~PMD_MASK;
 475         end = address + size;
 476         if (end > PMD_SIZE)
 477                 end = PMD_SIZE;
 478         do {
 479                 pte_t oldpage = *pte;
 480                 pte_clear(pte);
 481                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 482                         set_pte(pte, mk_pte(offset, prot));
 483                 forget_pte(oldpage);
 484                 address += PAGE_SIZE;
 485                 offset += PAGE_SIZE;
 486                 pte++;
 487         } while (address < end);
 488 }
 489 
 490 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 491         unsigned long offset, pgprot_t prot)
 492 {
 493         unsigned long end;
 494 
 495         address &= ~PGDIR_MASK;
 496         end = address + size;
 497         if (end > PGDIR_SIZE)
 498                 end = PGDIR_SIZE;
 499         offset -= address;
 500         do {
 501                 pte_t * pte = pte_alloc(pmd, address);
 502                 if (!pte)
 503                         return -ENOMEM;
 504                 remap_pte_range(pte, address, end - address, address + offset, prot);
 505                 address = (address + PMD_SIZE) & PMD_MASK;
 506                 pmd++;
 507         } while (address < end);
 508         return 0;
 509 }
 510 
 511 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 512 {
 513         int error = 0;
 514         pgd_t * dir;
 515         unsigned long end = from + size;
 516 
 517         offset -= from;
 518         dir = pgd_offset(current->mm, from);
 519         while (from < end) {
 520                 pmd_t *pmd = pmd_alloc(dir, from);
 521                 error = -ENOMEM;
 522                 if (!pmd)
 523                         break;
 524                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 525                 if (error)
 526                         break;
 527                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 528                 dir++;
 529         }
 530         invalidate();
 531         return error;
 532 }
 533 
 534 /*
 535  * sanity-check function..
 536  */
 537 static void put_page(pte_t * page_table, pte_t pte)
 538 {
 539         if (!pte_none(*page_table)) {
 540                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 541                 free_page(pte_page(pte));
 542                 return;
 543         }
 544 /* no need for invalidate */
 545         *page_table = pte;
 546 }
 547 
 548 /*
 549  * This routine is used to map in a page into an address space: needed by
 550  * execve() for the initial stack and environment pages.
 551  */
 552 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 553 {
 554         pgd_t * pgd;
 555         pmd_t * pmd;
 556         pte_t * pte;
 557 
 558         if (page >= high_memory)
 559                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 560         if (mem_map[MAP_NR(page)].count != 1)
 561                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 562         pgd = pgd_offset(tsk->mm,address);
 563         pmd = pmd_alloc(pgd, address);
 564         if (!pmd) {
 565                 free_page(page);
 566                 oom(tsk);
 567                 return 0;
 568         }
 569         pte = pte_alloc(pmd, address);
 570         if (!pte) {
 571                 free_page(page);
 572                 oom(tsk);
 573                 return 0;
 574         }
 575         if (!pte_none(*pte)) {
 576                 printk("put_dirty_page: page already exists\n");
 577                 pte_clear(pte);
 578                 invalidate();
 579         }
 580         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 581 /* no need for invalidate */
 582         return page;
 583 }
 584 
 585 /*
 586  * This routine handles present pages, when users try to write
 587  * to a shared page. It is done by copying the page to a new address
 588  * and decrementing the shared-page counter for the old page.
 589  *
 590  * Goto-purists beware: the only reason for goto's here is that it results
 591  * in better assembly code.. The "default" path will see no jumps at all.
 592  *
 593  * Note that this routine assumes that the protection checks have been
 594  * done by the caller (the low-level page fault routine in most cases).
 595  * Thus we can safely just mark it writable once we've done any necessary
 596  * COW.
 597  *
 598  * We also mark the page dirty at this point even though the page will
 599  * change only once the write actually happens. This avoids a few races,
 600  * and potentially makes it more efficient.
 601  */
 602 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 603         unsigned long address, int write_access)
 604 {
 605         pgd_t *page_dir;
 606         pmd_t *page_middle;
 607         pte_t *page_table, pte;
 608         unsigned long old_page, new_page;
 609 
 610         new_page = __get_free_page(GFP_KERNEL);
 611         page_dir = pgd_offset(vma->vm_mm, address);
 612         if (pgd_none(*page_dir))
 613                 goto end_wp_page;
 614         if (pgd_bad(*page_dir))
 615                 goto bad_wp_pagedir;
 616         page_middle = pmd_offset(page_dir, address);
 617         if (pmd_none(*page_middle))
 618                 goto end_wp_page;
 619         if (pmd_bad(*page_middle))
 620                 goto bad_wp_pagemiddle;
 621         page_table = pte_offset(page_middle, address);
 622         pte = *page_table;
 623         if (!pte_present(pte))
 624                 goto end_wp_page;
 625         if (pte_write(pte))
 626                 goto end_wp_page;
 627         old_page = pte_page(pte);
 628         if (old_page >= high_memory)
 629                 goto bad_wp_page;
 630         tsk->min_flt++;
 631         /*
 632          * Do we need to copy?
 633          */
 634         if (mem_map[MAP_NR(old_page)].count != 1) {
 635                 if (new_page) {
 636                         if (mem_map[MAP_NR(old_page)].reserved)
 637                                 ++vma->vm_mm->rss;
 638                         copy_page(old_page,new_page);
 639                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 640                         free_page(old_page);
 641                         invalidate();
 642                         return;
 643                 }
 644                 set_pte(page_table, BAD_PAGE);
 645                 free_page(old_page);
 646                 oom(tsk);
 647                 invalidate();
 648                 return;
 649         }
 650         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 651         invalidate();
 652         if (new_page)
 653                 free_page(new_page);
 654         return;
 655 bad_wp_page:
 656         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 657         send_sig(SIGKILL, tsk, 1);
 658         goto end_wp_page;
 659 bad_wp_pagemiddle:
 660         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 661         send_sig(SIGKILL, tsk, 1);
 662         goto end_wp_page;
 663 bad_wp_pagedir:
 664         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 665         send_sig(SIGKILL, tsk, 1);
 666 end_wp_page:
 667         if (new_page)
 668                 free_page(new_page);
 669         return;
 670 }
 671 
 672 /*
 673  * Ugly, ugly, but the goto's result in better assembly..
 674  */
 675 int verify_area(int type, const void * addr, unsigned long size)
 676 {
 677         struct vm_area_struct * vma;
 678         unsigned long start = (unsigned long) addr;
 679 
 680         /* If the current user space is mapped to kernel space (for the
 681          * case where we use a fake user buffer with get_fs/set_fs()) we
 682          * don't expect to find the address in the user vm map.
 683          */
 684         if (get_fs() == get_ds())
 685                 return 0;
 686 
 687         vma = find_vma(current, start);
 688         if (!vma)
 689                 goto bad_area;
 690         if (vma->vm_start <= start)
 691                 goto good_area;
 692         if (!(vma->vm_flags & VM_GROWSDOWN))
 693                 goto bad_area;
 694         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 695                 goto bad_area;
 696 
 697 good_area:
 698         if (type == VERIFY_WRITE)
 699                 goto check_write;
 700         for (;;) {
 701                 struct vm_area_struct * next;
 702                 if (!(vma->vm_flags & VM_READ))
 703                         goto bad_area;
 704                 if (vma->vm_end - start >= size)
 705                         return 0;
 706                 next = vma->vm_next;
 707                 if (!next || vma->vm_end != next->vm_start)
 708                         goto bad_area;
 709                 vma = next;
 710         }
 711 
 712 check_write:
 713         if (!(vma->vm_flags & VM_WRITE))
 714                 goto bad_area;
 715         if (!wp_works_ok)
 716                 goto check_wp_fault_by_hand;
 717         for (;;) {
 718                 if (vma->vm_end - start >= size)
 719                         break;
 720                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 721                         goto bad_area;
 722                 vma = vma->vm_next;
 723                 if (!(vma->vm_flags & VM_WRITE))
 724                         goto bad_area;
 725         }
 726         return 0;
 727 
 728 check_wp_fault_by_hand:
 729         size--;
 730         size += start & ~PAGE_MASK;
 731         size >>= PAGE_SHIFT;
 732         start &= PAGE_MASK;
 733 
 734         for (;;) {
 735                 do_wp_page(current, vma, start, 1);
 736                 if (!size)
 737                         break;
 738                 size--;
 739                 start += PAGE_SIZE;
 740                 if (start < vma->vm_end)
 741                         continue;
 742                 vma = vma->vm_next;
 743                 if (!vma || vma->vm_start != start)
 744                         goto bad_area;
 745                 if (!(vma->vm_flags & VM_WRITE))
  746                         goto bad_area;
 747         }
 748         return 0;
 749 
 750 bad_area:
 751         return -EFAULT;
 752 }
 753 
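      /*
       * Give the process a fresh, zeroed, writable page at *page_table,
       * or a BAD_PAGE mapping if no memory is available.
       */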
 754 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 755 {
 756         unsigned long tmp;
 757 
 758         if (!(tmp = get_free_page(GFP_KERNEL))) {
 759                 oom(tsk);
 760                 put_page(page_table, BAD_PAGE);
 761                 return;
 762         }
 763         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 764 }
 765 
 766 /*
  767  * try_to_share() checks the page at 'from_address' in 'from_area' to see
  768  * if it exists and may be shared. If so, it is mapped (or, when 'newpage'
  769  * is given, copied) at 'to_address' in 'to_area'.
  770  *
  771  * NOTE! This assumes we have checked that the two areas differ, and that
  772  * they share the same inode and can generally otherwise be shared.
 773  */
 774 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 775         unsigned long from_address, struct vm_area_struct * from_area,
 776         unsigned long newpage)
 777 {
 778         pgd_t * from_dir, * to_dir;
 779         pmd_t * from_middle, * to_middle;
 780         pte_t * from_table, * to_table;
 781         pte_t from, to;
 782 
 783         from_dir = pgd_offset(from_area->vm_mm,from_address);
 784 /* is there a page-directory at from? */
 785         if (pgd_none(*from_dir))
 786                 return 0;
 787         if (pgd_bad(*from_dir)) {
 788                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 789                 pgd_clear(from_dir);
 790                 return 0;
 791         }
 792         from_middle = pmd_offset(from_dir, from_address);
 793 /* is there a mid-directory at from? */
 794         if (pmd_none(*from_middle))
 795                 return 0;
 796         if (pmd_bad(*from_middle)) {
 797                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 798                 pmd_clear(from_middle);
 799                 return 0;
 800         }
 801         from_table = pte_offset(from_middle, from_address);
 802         from = *from_table;
 803 /* is the page present? */
 804         if (!pte_present(from))
 805                 return 0;
 806 /* if it is dirty it must be from a shared mapping to be shared */
 807         if (pte_dirty(from)) {
 808                 if (!(from_area->vm_flags & VM_SHARED))
 809                         return 0;
 810         }
 811 /* is the page reasonable at all? */
 812         if (pte_page(from) >= high_memory)
 813                 return 0;
 814         if (mem_map[MAP_NR(pte_page(from))].reserved)
 815                 return 0;
 816 /* is the destination ok? */
 817         to_dir = pgd_offset(to_area->vm_mm,to_address);
 818 /* is there a page-directory at to? */
 819         if (pgd_none(*to_dir))
 820                 return 0;
 821         if (pgd_bad(*to_dir)) {
 822                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 823                 return 0;
 824         }
 825         to_middle = pmd_offset(to_dir, to_address);
 826 /* is there a mid-directory at to? */
 827         if (pmd_none(*to_middle))
 828                 return 0;
 829         if (pmd_bad(*to_middle)) {
 830                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 831                 return 0;
 832         }
 833         to_table = pte_offset(to_middle, to_address);
 834         to = *to_table;
 835         if (!pte_none(to))
 836                 return 0;
 837 /* do we copy? */
 838         if (newpage) {
 839                 /* if it's in the swap cache, it's dirty by implication */
 840                 /* so we can't use it if it's not from a shared mapping */
 841                 if (in_swap_cache(pte_page(from))) {
 842                         if (!(from_area->vm_flags & VM_SHARED))
 843                                 return 0;
 844                 }
 845                 copy_page(pte_page(from), newpage);
 846                 set_pte(to_table, mk_pte(newpage, to_area->vm_page_prot));
 847                 return 1;
 848         }
 849 /*
 850  * do a final swap-cache test before sharing them: if it's in the swap
 851  * cache, we have to remove it now, as we get two pointers to the same
 852  * physical page and the cache can't handle it. Mark the original dirty.
 853  *
 854  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 855  * with a dirty "from", the from-mapping is a shared map, so we can trust
 856  * the page contents to be up-to-date
 857  */
 858         if (in_swap_cache(pte_page(from))) {
 859                 if (!(from_area->vm_flags & VM_SHARED))
 860                         return 0;
 861                 set_pte(from_table, pte_mkdirty(from));
 862                 delete_from_swap_cache(pte_page(from));
 863         }
 864         mem_map[MAP_NR(pte_page(from))].count++;
 865         set_pte(to_table, mk_pte(pte_page(from), to_area->vm_page_prot));
 866 /* Check if we need to do anything at all to the 'from' field */
 867         if (!pte_write(from))
 868                 return 1;
 869         if (from_area->vm_flags & VM_SHARED)
 870                 return 1;
 871 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 872         set_pte(from_table, pte_wrprotect(from));
 873         invalidate();
 874         return 1;
 875 }
 876 
 877 /*
 878  * share_page() tries to find a process that could share a page with
 879  * the current one.
 880  *
 881  * We first check if it is at all feasible by checking inode->i_count.
 882  * It should be >1 if there are other tasks sharing this inode.
 883  */
 884 static int share_page(struct vm_area_struct * area, unsigned long address,
 885         int write_access, unsigned long newpage)
 886 {
 887         struct inode * inode;
 888         unsigned long offset;
 889         unsigned long from_address;
 890         unsigned long give_page;
 891         struct vm_area_struct * mpnt;
 892 
 893         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 894                 return 0;
 895         /* do we need to copy or can we just share? */
 896         give_page = 0;
 897         if (write_access && !(area->vm_flags & VM_SHARED)) {
 898                 if (!newpage)
 899                         return 0;
 900                 give_page = newpage;
 901         }
 902         offset = address - area->vm_start + area->vm_offset;
 903         /* See if there is something in the VM we can share pages with. */
 904         /* Traverse the entire circular i_mmap list, except `area' itself. */
 905         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 906                 /* must be same inode */
 907                 if (mpnt->vm_inode != inode) {
 908                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 909                         break;  
 910                 }
 911                 /* offsets must be mutually page-aligned */
 912                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 913                         continue;
 914                 /* the other area must actually cover the wanted page.. */
 915                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 916                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 917                         continue;
 918                 /* .. NOW we can actually try to use the same physical page */
 919                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 920                         continue;
 921                 /* free newpage if we never used it.. */
 922                 if (give_page || !newpage)
 923                         return 1;
 924                 free_page(newpage);
 925                 return 1;
 926         }
 927         return 0;
 928 }
 929 
 930 /*
 931  * This function tries to find a page that is shared with the buffer cache,
 932  * and if so it moves the buffer cache to a new location.
 933  *
 934  * It returns non-zero if we used up the "new_page" page.
 935  */
 936 static int unshare(struct vm_area_struct *vma, unsigned long address, unsigned long new_page)
 937 {
 938         pgd_t *page_dir;
 939         pmd_t *page_middle;
 940         pte_t *page_table, pte;
 941         unsigned long old_page;
 942         struct buffer_head * bh, * tmp;
 943 
 944         page_dir = pgd_offset(vma->vm_mm, address);
 945         if (pgd_none(*page_dir))
 946                 return 0;
 947         if (pgd_bad(*page_dir)) {
 948                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 949                 pgd_clear(page_dir);
 950                 return 0;
 951         }
 952         page_middle = pmd_offset(page_dir, address);
 953         if (pmd_none(*page_middle))
 954                 return 0;
 955         if (pmd_bad(*page_middle)) {
  956                 printk("bad page table directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 957                 pmd_clear(page_middle);
 958                 return 0;
 959         }
 960         page_table = pte_offset(page_middle, address);
 961         pte = *page_table;
 962         if (!pte_present(pte))
 963                 return 0;
 964         old_page = pte_page(pte);
 965         if (MAP_NR(old_page) > MAP_NR(high_memory))
 966                 return 0;
 967         address &= ~PAGE_MASK;
 968         memset((void *) (old_page + address), 0, PAGE_SIZE - address);
 969         bh = buffer_pages[MAP_NR(old_page)];
 970         if (!bh)
 971                 return 0;
 972         if (!new_page) {
 973                 printk("Aieee... unshare(): no page available\n");
 974                 return 0;
 975         }
 976         buffer_pages[MAP_NR(old_page)] = NULL;
 977         copy_page(old_page, new_page);
 978         free_page(old_page);
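              /*
               * 'old_page' now holds the distance from the new page to the old
               * one; subtracting it from each buffer head's b_data pointer
               * below moves the buffers into the new page.
               */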
 979         old_page -= new_page;
 980         buffer_pages[MAP_NR(new_page)] = bh;
 981         tmp = bh;
 982         do {
 983                 tmp->b_data -= old_page;
 984                 tmp = tmp->b_this_page;
 985         } while (tmp != bh);
 986         return 1;
 987 }
 988 
 989 /*
 990  * Handle all mappings that got truncated by a "truncate()"
 991  * system call.
 992  *
 993  * NOTE! We have to be ready to update the memory sharing
 994  * between the file and the memory map for a potential last
 995  * incomplete page.  Ugly, but necessary.
 996  */
 997 void vmtruncate(struct inode * inode, unsigned long offset)
 998 {
 999         unsigned long page;
1000         struct vm_area_struct * mpnt;
1001 
1002         if (!inode->i_mmap)
1003                 return;
1004         page = __get_free_page(GFP_KERNEL);
1005         mpnt = inode->i_mmap;
1006         if (!mpnt) {
1007                 free_page(page);
1008                 return;
1009         }
1010         do {
1011                 unsigned long start = mpnt->vm_start;
1012                 unsigned long len = mpnt->vm_end - start;
1013                 unsigned long diff;
1014 
1015                 /* mapping wholly truncated? */
1016                 if (mpnt->vm_offset >= offset) {
1017                         zap_page_range(mpnt->vm_mm, start, len);
1018                         continue;
1019                 }
1020                 /* mapping wholly unaffected? */
1021                 diff = offset - mpnt->vm_offset;
1022                 if (diff >= len)
1023                         continue;
1024                 /* Ok, partially affected.. */
1025                 start += diff;
1026                 len = (len - diff) & PAGE_MASK;
1027                 /* Ugh, here comes the _really_ ugly part.. */
1028                 if (start & ~PAGE_MASK) {
1029                         if (unshare(mpnt, start, page))
1030                                 page = 0;
1031                         start = (start + ~PAGE_MASK) & PAGE_MASK;
1032                 }
1033                 zap_page_range(mpnt->vm_mm, start, len);
1034         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
1035         free_page(page);
1036 }
1037 
1038 /*
1039  * fill in an empty page-table if none exists.
1040  */
1041 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
1042 {
1043         pgd_t *pgd;
1044         pmd_t *pmd;
1045         pte_t *pte;
1046 
1047         pgd = pgd_offset(tsk->mm, address);
1048         pmd = pmd_alloc(pgd, address);
1049         if (!pmd) {
1050                 oom(tsk);
1051                 return NULL;
1052         }
1053         pte = pte_alloc(pmd, address);
1054         if (!pte) {
1055                 oom(tsk);
1056                 return NULL;
1057         }
1058         return pte;
1059 }
1060 
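      /*
       * Bring a swapped-out page back in. Mappings without their own swapin
       * operation go through the generic swap_in(); otherwise the vma's
       * swapin routine supplies the page, which is write-protected again if
       * it is shared and the mapping is private.
       */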
1061 static inline void do_swap_page(struct task_struct * tsk, 
1062         struct vm_area_struct * vma, unsigned long address,
1063         pte_t * page_table, pte_t entry, int write_access)
1064 {
1065         pte_t page;
1066 
1067         if (!vma->vm_ops || !vma->vm_ops->swapin) {
1068                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
1069                 return;
1070         }
1071         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
1072         if (pte_val(*page_table) != pte_val(entry)) {
1073                 free_page(pte_page(page));
1074                 return;
1075         }
1076         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
1077                 page = pte_wrprotect(page);
1078         ++vma->vm_mm->rss;
1079         ++tsk->maj_flt;
1080         set_pte(page_table, page);
1081         return;
1082 }
1083 
1084 /*
1085  * do_no_page() tries to create a new page mapping. It aggressively
1086  * tries to share with existing pages, but makes a separate copy if
1087  * the "write_access" parameter is true in order to avoid the next
1088  * page fault.
1089  */
1090 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
1091         unsigned long address, int write_access)
1092 {
1093         pte_t * page_table;
1094         pte_t entry;
1095         unsigned long page;
1096 
1097         page_table = get_empty_pgtable(tsk, address);
1098         if (!page_table)
1099                 return;
1100         entry = *page_table;
1101         if (pte_present(entry))
1102                 return;
1103         if (!pte_none(entry)) {
1104                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
1105                 return;
1106         }
1107         address &= PAGE_MASK;
1108         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1109                 ++vma->vm_mm->rss;
1110                 ++tsk->min_flt;
1111                 get_empty_page(tsk, vma, page_table);
1112                 return;
1113         }
1114         page = __get_free_page(GFP_KERNEL);
1115         if (share_page(vma, address, write_access, page)) {
1116                 ++vma->vm_mm->rss;
1117                 ++tsk->min_flt;
1118                 return;
1119         }
1120         if (!page) {
1121                 oom(tsk);
1122                 put_page(page_table, BAD_PAGE);
1123                 return;
1124         }
1125         ++tsk->maj_flt;
1126         ++vma->vm_mm->rss;
1127         /*
1128          * The fourth argument is "no_share", which tells the low-level code
1129          * to copy, not share the page even if sharing is possible.  It's
1130          * essentially an early COW detection 
1131          */
1132         page = vma->vm_ops->nopage(vma, address, page,
1133                 write_access && !(vma->vm_flags & VM_SHARED));
1134         if (share_page(vma, address, write_access, 0)) {
1135                 free_page(page);
1136                 return;
1137         }
1138         /*
1139          * This silly early PAGE_DIRTY setting removes a race
1140          * due to the bad i386 page protection. But it's valid
1141          * for other architectures too.
1142          *
1143          * Note that if write_access is true, we either now have
 1144  * an exclusive copy of the page, or this is a shared mapping,
1145          * so we can make it writable and dirty to avoid having to
1146          * handle that later.
1147          */
1148         entry = mk_pte(page, vma->vm_page_prot);
1149         if (write_access) {
1150                 entry = pte_mkwrite(pte_mkdirty(entry));
1151         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
1152                 entry = pte_wrprotect(entry);
1153         put_page(page_table, entry);
1154 }
1155 
1156 /*
1157  * The above separate functions for the no-page and wp-page
1158  * cases will go away (they mostly do the same thing anyway),
1159  * and we'll instead use only a general "handle_mm_fault()".
1160  *
1161  * These routines also need to handle stuff like marking pages dirty
1162  * and/or accessed for architectures that don't do it in hardware (most
1163  * RISC architectures).  The early dirtying is also good on the i386.
1164  *
1165  * There is also a hook called "update_mmu_cache()" that architectures
1166  * with external mmu caches can use to update those (ie the Sparc or
1167  * PowerPC hashed page tables that act as extended TLBs).
1168  */
1169 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1170         int write_access, pte_t * pte)
1171 {
1172         if (!pte_present(*pte)) {
1173                 do_no_page(current, vma, address, write_access);
1174                 return;
1175         }
1176         set_pte(pte, pte_mkyoung(*pte));
1177         if (!write_access)
1178                 return;
1179         if (pte_write(*pte)) {
1180                 set_pte(pte, pte_mkdirty(*pte));
1181                 return;
1182         }
1183         do_wp_page(current, vma, address, write_access);
1184 }
1185 
1186 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1187         int write_access)
1188 {
1189         pgd_t *pgd;
1190         pmd_t *pmd;
1191         pte_t *pte;
1192 
1193         pgd = pgd_offset(vma->vm_mm, address);
1194         pmd = pmd_alloc(pgd, address);
1195         if (!pmd)
1196                 goto no_memory;
1197         pte = pte_alloc(pmd, address);
1198         if (!pte)
1199                 goto no_memory;
1200         handle_pte_fault(vma, address, write_access, pte);
1201         update_mmu_cache(vma, address, *pte);
1202         return;
1203 no_memory:
1204         oom(current);
1205 }
