root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. oom
  2. free_one_pmd
  3. free_one_pgd
  4. clear_page_tables
  5. free_page_tables
  6. new_page_tables
  7. copy_one_pte
  8. copy_pte_range
  9. copy_pmd_range
  10. copy_page_range
  11. forget_pte
  12. zap_pte_range
  13. zap_pmd_range
  14. zap_page_range
  15. zeromap_pte_range
  16. zeromap_pmd_range
  17. zeromap_page_range
  18. remap_pte_range
  19. remap_pmd_range
  20. remap_page_range
  21. put_page
  22. put_dirty_page
  23. do_wp_page
  24. verify_area
  25. get_empty_page
  26. try_to_share
  27. share_page
  28. unshare
  29. vmtruncate
  30. get_empty_pgtable
  31. do_swap_page
  32. do_no_page
  33. handle_pte_fault
  34. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 
  47 #include <asm/system.h>
  48 #include <asm/segment.h>
  49 #include <asm/pgtable.h>
  50 
  51 unsigned long high_memory = 0;
  52 
  53 /*
  54  * The free_area_list arrays point to the queue heads of the free areas
  55  * of different sizes
  56  */
  57 int nr_swap_pages = 0;
  58 int nr_free_pages = 0;
  59 struct mem_list free_area_list[NR_MEM_LISTS];
  60 unsigned char * free_area_map[NR_MEM_LISTS];
  61 
  62 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  63 
  64 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  65 
  66 mem_map_t * mem_map = NULL;
  67 
  68 /*
  69  * oom() prints a message (so that the user knows why the process died),
  70  * and gives the process an untrappable SIGKILL.
  71  */
  72 void oom(struct task_struct * task)
  73 {
  74         printk("\nOut of memory for %s.\n", current->comm);
  75         task->sig->action[SIGKILL-1].sa_handler = NULL;
  76         task->blocked &= ~(1<<(SIGKILL-1));
  77         send_sig(SIGKILL,task,1);
  78 }
  79 
  80 /*
  81  * Note: this doesn't free the actual pages themselves. That
  82  * has been handled earlier when unmapping all the memory regions.
  83  */
  84 static inline void free_one_pmd(pmd_t * dir)
  85 {
  86         pte_t * pte;
  87 
  88         if (pmd_none(*dir))
  89                 return;
  90         if (pmd_bad(*dir)) {
  91                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  92                 pmd_clear(dir);
  93                 return;
  94         }
  95         pte = pte_offset(dir, 0);
  96         pmd_clear(dir);
  97         pte_free(pte);
  98 }
  99 
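     /*
      * Free everything hanging off one page-directory entry: the pte
      * pages (via free_one_pmd) and the pmd page itself.  If the pmd
      * page is marked in use elsewhere (pmd_inuse), the pte level is
      * left alone and only the pmd reference is dropped.
      */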
 100 static inline void free_one_pgd(pgd_t * dir)
 101 {
 102         pmd_t * pmd;
 103 
 104         if (pgd_none(*dir))
 105                 return;
 106         if (pgd_bad(*dir)) {
 107                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 108                 pgd_clear(dir);
 109                 return;
 110         }
 111         pmd = pmd_offset(dir, 0);
 112         pgd_clear(dir);
 113         if (!pmd_inuse(pmd)) {
 114                 int j;
 115                 for (j = 0; j < PTRS_PER_PMD ; j++)
 116                         free_one_pmd(pmd+j);
 117         }
 118         pmd_free(pmd);
 119 }
 120         
 121 /*
 122  * This function clears all user-level page tables of a process - this
 123  * is needed by execve(), so that old pages aren't in the way.
 124  */
 125 void clear_page_tables(struct task_struct * tsk)
 126 {
 127         int i;
 128         pgd_t * page_dir;
 129 
 130         page_dir = tsk->mm->pgd;
 131         if (!page_dir || page_dir == swapper_pg_dir) {
 132                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 133                 return;
 134         }
 135         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 136                 free_one_pgd(page_dir + i);
 137         invalidate();
 138 }
 139 
 140 /*
 141  * This function frees up all page tables of a process when it exits. It
 142  * is the same as "clear_page_tables()", except it also changes the process'
 143  * page table directory to the kernel page tables and then frees the old
 144  * page table directory.
 145  */
 146 void free_page_tables(struct task_struct * tsk)
 147 {
 148         int i;
 149         pgd_t * page_dir;
 150 
 151         page_dir = tsk->mm->pgd;
 152         if (!page_dir || page_dir == swapper_pg_dir) {
 153                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 154                 return;
 155         }
 156         SET_PAGE_DIR(tsk, swapper_pg_dir);
 157         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 158         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 159                 free_one_pgd(page_dir + i);
 160         pgd_free(page_dir);
 161         invalidate();
 162 }
 163 
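     /*
      * Give "tsk" a fresh page directory: allocate one, copy the kernel
      * entries (above TASK_SIZE) from init_mm into it, and install it as
      * the task's active page directory.
      */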
 164 int new_page_tables(struct task_struct * tsk)
 165 {
 166         pgd_t * page_dir, * new_pg;
 167         int i;
 168 
 169         if (!(new_pg = pgd_alloc()))
 170                 return -ENOMEM;
 171         page_dir = pgd_offset(&init_mm, 0);
 172         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 173                 new_pg[i] = page_dir[i];
 174         SET_PAGE_DIR(tsk, new_pg);
 175         tsk->mm->pgd = new_pg;
 176         return 0;
 177 }
 178 
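     /*
      * Copy a single pte from one page table to another: swap entries
      * just get their use count bumped, reserved and out-of-range pages
      * are copied verbatim, and ordinary present pages are shared:
      * COW-able ones are write-protected in both parent and child, and
      * the page's mem_map count is incremented.  The child's copy
      * starts out "old".
      */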
 179 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 180 {
 181         pte_t pte = *old_pte;
 182 
 183         if (pte_none(pte))
 184                 return;
 185         if (!pte_present(pte)) {
 186                 swap_duplicate(pte_val(pte));
 187                 set_pte(new_pte, pte);
 188                 return;
 189         }
 190         if (pte_page(pte) > high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 191                 set_pte(new_pte, pte);
 192                 return;
 193         }
 194         if (pte_cow(pte))
 195                 pte = pte_wrprotect(pte);
 196         if (delete_from_swap_cache(pte_page(pte)))
 197                 pte = pte_mkdirty(pte);
 198         set_pte(new_pte, pte_mkold(pte));
 199         set_pte(old_pte, pte);
 200         mem_map[MAP_NR(pte_page(pte))].count++;
 201 }
 202 
 203 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size)
 204 {
 205         pte_t * src_pte, * dst_pte;
 206         unsigned long end;
 207 
 208         if (pmd_none(*src_pmd))
 209                 return 0;
 210         if (pmd_bad(*src_pmd)) {
 211                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 212                 pmd_clear(src_pmd);
 213                 return 0;
 214         }
 215         src_pte = pte_offset(src_pmd, address);
 216         if (pmd_none(*dst_pmd)) {
 217                 if (!pte_alloc(dst_pmd, 0))
 218                         return -ENOMEM;
 219         }
 220         dst_pte = pte_offset(dst_pmd, address);
 221         address &= ~PMD_MASK;
 222         end = address + size;
 223         if (end >= PMD_SIZE)
 224                 end = PMD_SIZE;
 225         do {
 226                 /* I would like to switch arguments here, to make it
 227                  * consistent with copy_xxx_range and memcpy syntax.
 228                  */
 229                 copy_one_pte(src_pte++, dst_pte++);
 230                 address += PAGE_SIZE;
 231         } while (address < end);
 232         return 0;
 233 }
 234 
 235 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size)
 236 {
 237         pmd_t * src_pmd, * dst_pmd;
 238         unsigned long end;
 239         int error = 0;
 240 
 241         if (pgd_none(*src_pgd))
 242                 return 0;
 243         if (pgd_bad(*src_pgd)) {
 244                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 245                 pgd_clear(src_pgd);
 246                 return 0;
 247         }
 248         src_pmd = pmd_offset(src_pgd, address);
 249         if (pgd_none(*dst_pgd)) {
 250                 if (!pmd_alloc(dst_pgd, 0))
 251                         return -ENOMEM;
 252         }
 253         dst_pmd = pmd_offset(dst_pgd, address);
 254         address &= ~PGDIR_MASK;
 255         end = address + size;
 256         if (end > PGDIR_SIZE)
 257                 end = PGDIR_SIZE;
 258         do {
 259                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address);
 260                 if (error)
 261                         break;
 262                 address = (address + PMD_SIZE) & PMD_MASK; 
 263         } while (address < end);
 264         return error;
 265 }
 266 
 267 /*
 268  * copy one vm_area from one task to the other. Assumes that the page
 269  * tables in the new task have already been cleared over the whole range
 270  * covered by this vma.
 271  */
 272 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 273                         struct vm_area_struct *vma)
 274 {
 275         pgd_t * src_pgd, * dst_pgd;
 276         unsigned long address = vma->vm_start;
 277         unsigned long end = vma->vm_end;
 278         int error = 0;
 279 
 280         src_pgd = pgd_offset(src, address);
 281         dst_pgd = pgd_offset(dst, address);
 282         while (address < end) {
 283                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address);
 284                 if (error)
 285                         break;
 286                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 287         }
 288         invalidate();
 289         return error;
 290 }
 291 
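     /*
      * Drop whatever a discarded pte pointed at: free the page (and, for
      * ordinary pages, decrement the rss count) if it was present,
      * otherwise release the swap entry.
      */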
 292 static inline void forget_pte(pte_t page)
 293 {
 294         if (pte_none(page))
 295                 return;
 296         if (pte_present(page)) {
 297                 free_page(pte_page(page));
 298                 if (mem_map[MAP_NR(pte_page(page))].reserved)
 299                         return;
 300                 if (current->mm->rss <= 0)
 301                         return;
 302                 current->mm->rss--;
 303                 return;
 304         }
 305         swap_free(pte_val(page));
 306 }
 307 
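     /*
      * zap_pte_range/zap_pmd_range each walk one level of the page-table
      * tree, clearing every pte in the requested range and handing the
      * old contents to forget_pte().  Bad entries are reported and
      * cleared rather than followed.
      */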
 308 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 309 {
 310         pte_t * pte;
 311         unsigned long end;
 312 
 313         if (pmd_none(*pmd))
 314                 return;
 315         if (pmd_bad(*pmd)) {
 316                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 317                 pmd_clear(pmd);
 318                 return;
 319         }
 320         pte = pte_offset(pmd, address);
 321         address &= ~PMD_MASK;
 322         end = address + size;
 323         if (end >= PMD_SIZE)
 324                 end = PMD_SIZE;
 325         do {
 326                 pte_t page = *pte;
 327                 pte_clear(pte);
 328                 forget_pte(page);
 329                 address += PAGE_SIZE;
 330                 pte++;
 331         } while (address < end);
 332 }
 333 
 334 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 335 {
 336         pmd_t * pmd;
 337         unsigned long end;
 338 
 339         if (pgd_none(*dir))
 340                 return;
 341         if (pgd_bad(*dir)) {
 342                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 343                 pgd_clear(dir);
 344                 return;
 345         }
 346         pmd = pmd_offset(dir, address);
 347         address &= ~PGDIR_MASK;
 348         end = address + size;
 349         if (end > PGDIR_SIZE)
 350                 end = PGDIR_SIZE;
 351         do {
 352                 zap_pte_range(pmd, address, end - address);
 353                 address = (address + PMD_SIZE) & PMD_MASK; 
 354                 pmd++;
 355         } while (address < end);
 356 }
 357 
 358 /*
 359  * remove user pages in a given range.
 360  */
 361 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 362 {
 363         pgd_t * dir;
 364         unsigned long end = address + size;
 365 
 366         dir = pgd_offset(mm, address);
 367         while (address < end) {
 368                 zap_pmd_range(dir, address, end - address);
 369                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 370                 dir++;
 371         }
 372         invalidate();
 373         return 0;
 374 }
 375 
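     /*
      * The zeromap_* family points every pte in the range at the global
      * ZERO_PAGE, write protected: reads see zeroes, and a later write
      * takes a protection fault so that do_wp_page() can hand out a
      * private copy (note the reserved-page rss fixup there).
      */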
 376 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 377 {
 378         unsigned long end;
 379 
 380         address &= ~PMD_MASK;
 381         end = address + size;
 382         if (end > PMD_SIZE)
 383                 end = PMD_SIZE;
 384         do {
 385                 pte_t oldpage = *pte;
 386                 set_pte(pte, zero_pte);
 387                 forget_pte(oldpage);
 388                 address += PAGE_SIZE;
 389                 pte++;
 390         } while (address < end);
 391 }
 392 
 393 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 394 {
 395         unsigned long end;
 396 
 397         address &= ~PGDIR_MASK;
 398         end = address + size;
 399         if (end > PGDIR_SIZE)
 400                 end = PGDIR_SIZE;
 401         do {
 402                 pte_t * pte = pte_alloc(pmd, address);
 403                 if (!pte)
 404                         return -ENOMEM;
 405                 zeromap_pte_range(pte, address, end - address, zero_pte);
 406                 address = (address + PMD_SIZE) & PMD_MASK;
 407                 pmd++;
 408         } while (address < end);
 409         return 0;
 410 }
 411 
 412 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 413 {
 414         int error = 0;
 415         pgd_t * dir;
 416         unsigned long end = address + size;
 417         pte_t zero_pte;
 418 
 419         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 420         dir = pgd_offset(current->mm, address);
 421         while (address < end) {
 422                 pmd_t *pmd = pmd_alloc(dir, address);
 423                 error = -ENOMEM;
 424                 if (!pmd)
 425                         break;
 426                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 427                 if (error)
 428                         break;
 429                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 430                 dir++;
 431         }
 432         invalidate();
 433         return error;
 434 }
 435 
 436 /*
 437  * maps a range of physical memory into the requested pages. the old
 438  * mappings are removed. any references to nonexistent pages result
 439  * in null mappings (currently treated as "copy-on-access")
 440  */
 441 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 442         unsigned long offset, pgprot_t prot)
 443 {
 444         unsigned long end;
 445 
 446         address &= ~PMD_MASK;
 447         end = address + size;
 448         if (end > PMD_SIZE)
 449                 end = PMD_SIZE;
 450         do {
 451                 pte_t oldpage = *pte;
 452                 pte_clear(pte);
 453                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 454                         set_pte(pte, mk_pte(offset, prot));
 455                 forget_pte(oldpage);
 456                 address += PAGE_SIZE;
 457                 offset += PAGE_SIZE;
 458                 pte++;
 459         } while (address < end);
 460 }
 461 
 462 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 463         unsigned long offset, pgprot_t prot)
 464 {
 465         unsigned long end;
 466 
 467         address &= ~PGDIR_MASK;
 468         end = address + size;
 469         if (end > PGDIR_SIZE)
 470                 end = PGDIR_SIZE;
 471         offset -= address;
 472         do {
 473                 pte_t * pte = pte_alloc(pmd, address);
 474                 if (!pte)
 475                         return -ENOMEM;
 476                 remap_pte_range(pte, address, end - address, address + offset, prot);
 477                 address = (address + PMD_SIZE) & PMD_MASK;
 478                 pmd++;
 479         } while (address < end);
 480         return 0;
 481 }
 482 
 483 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 484 {
 485         int error = 0;
 486         pgd_t * dir;
 487         unsigned long end = from + size;
 488 
 489         offset -= from;
 490         dir = pgd_offset(current->mm, from);
 491         while (from < end) {
 492                 pmd_t *pmd = pmd_alloc(dir, from);
 493                 error = -ENOMEM;
 494                 if (!pmd)
 495                         break;
 496                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 497                 if (error)
 498                         break;
 499                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 500                 dir++;
 501         }
 502         invalidate();
 503         return error;
 504 }
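
     /*
      * A typical caller (sketch, not from this file): a device driver's
      * mmap() method can hand the whole job to remap_page_range(), e.g.
      *
      *      if (remap_page_range(vma->vm_start, phys_addr,
      *                           vma->vm_end - vma->vm_start,
      *                           vma->vm_page_prot))
      *              return -ENOMEM;
      *
      * where phys_addr stands for the page-aligned physical address
      * being mapped (the "offset" argument above).
      */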
 505 
 506 /*
 507  * sanity-check function..
 508  */
 509 static void put_page(pte_t * page_table, pte_t pte)
 510 {
 511         if (!pte_none(*page_table)) {
 512                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 513                 free_page(pte_page(pte));
 514                 return;
 515         }
 516 /* no need for invalidate */
 517         *page_table = pte;
 518 }
 519 
 520 /*
 521  * This routine is used to map in a page into an address space: needed by
 522  * execve() for the initial stack and environment pages.
 523  */
 524 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 525 {
 526         pgd_t * pgd;
 527         pmd_t * pmd;
 528         pte_t * pte;
 529 
 530         if (page >= high_memory)
 531                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 532         if (mem_map[MAP_NR(page)].count != 1)
 533                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 534         pgd = pgd_offset(tsk->mm,address);
 535         pmd = pmd_alloc(pgd, address);
 536         if (!pmd) {
 537                 free_page(page);
 538                 oom(tsk);
 539                 return 0;
 540         }
 541         pte = pte_alloc(pmd, address);
 542         if (!pte) {
 543                 free_page(page);
 544                 oom(tsk);
 545                 return 0;
 546         }
 547         if (!pte_none(*pte)) {
 548                 printk("put_dirty_page: page already exists\n");
 549                 pte_clear(pte);
 550                 invalidate();
 551         }
 552         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 553 /* no need for invalidate */
 554         return page;
 555 }
 556 
 557 /*
 558  * This routine handles present pages, when users try to write
 559  * to a shared page. It is done by copying the page to a new address
 560  * and decrementing the shared-page counter for the old page.
 561  *
 562  * Goto-purists beware: the only reason for goto's here is that it results
 563  * in better assembly code.. The "default" path will see no jumps at all.
 564  *
 565  * Note that this routine assumes that the protection checks have been
 566  * done by the caller (the low-level page fault routine in most cases).
 567  * Thus we can safely just mark it writable once we've done any necessary
 568  * COW.
 569  *
 570  * We also mark the page dirty at this point even though the page will
 571  * change only once the write actually happens. This avoids a few races,
 572  * and potentially makes it more efficient.
 573  */
 574 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 575         unsigned long address, int write_access)
 576 {
 577         pgd_t *page_dir;
 578         pmd_t *page_middle;
 579         pte_t *page_table, pte;
 580         unsigned long old_page, new_page;
 581 
 582         new_page = __get_free_page(GFP_KERNEL);
 583         page_dir = pgd_offset(vma->vm_mm, address);
 584         if (pgd_none(*page_dir))
 585                 goto end_wp_page;
 586         if (pgd_bad(*page_dir))
 587                 goto bad_wp_pagedir;
 588         page_middle = pmd_offset(page_dir, address);
 589         if (pmd_none(*page_middle))
 590                 goto end_wp_page;
 591         if (pmd_bad(*page_middle))
 592                 goto bad_wp_pagemiddle;
 593         page_table = pte_offset(page_middle, address);
 594         pte = *page_table;
 595         if (!pte_present(pte))
 596                 goto end_wp_page;
 597         if (pte_write(pte))
 598                 goto end_wp_page;
 599         old_page = pte_page(pte);
 600         if (old_page >= high_memory)
 601                 goto bad_wp_page;
 602         tsk->min_flt++;
 603         /*
 604          * Do we need to copy?
 605          */
 606         if (mem_map[MAP_NR(old_page)].count != 1) {
 607                 if (new_page) {
 608                         if (mem_map[MAP_NR(old_page)].reserved)
 609                                 ++vma->vm_mm->rss;
 610                         copy_page(old_page,new_page);
 611                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 612                         free_page(old_page);
 613                         invalidate();
 614                         return;
 615                 }
 616                 set_pte(page_table, BAD_PAGE);
 617                 free_page(old_page);
 618                 oom(tsk);
 619                 invalidate();
 620                 return;
 621         }
 622         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 623         invalidate();
 624         if (new_page)
 625                 free_page(new_page);
 626         return;
 627 bad_wp_page:
 628         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 629         send_sig(SIGKILL, tsk, 1);
 630         goto end_wp_page;
 631 bad_wp_pagemiddle:
 632         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 633         send_sig(SIGKILL, tsk, 1);
 634         goto end_wp_page;
 635 bad_wp_pagedir:
 636         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 637         send_sig(SIGKILL, tsk, 1);
 638 end_wp_page:
 639         if (new_page)
 640                 free_page(new_page);
 641         return;
 642 }
 643 
 644 /*
 645  * Ugly, ugly, but the goto's result in better assembly..
 646  */
 647 int verify_area(int type, const void * addr, unsigned long size)
 648 {
 649         struct vm_area_struct * vma;
 650         unsigned long start = (unsigned long) addr;
 651 
 652         /* If the current user space is mapped to kernel space (for the
 653          * case where we use a fake user buffer with get_fs/set_fs()) we
 654          * don't expect to find the address in the user vm map.
 655          */
 656         if (get_fs() == get_ds())
 657                 return 0;
 658 
 659         vma = find_vma(current, start);
 660         if (!vma)
 661                 goto bad_area;
 662         if (vma->vm_start <= start)
 663                 goto good_area;
 664         if (!(vma->vm_flags & VM_GROWSDOWN))
 665                 goto bad_area;
 666         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 667                 goto bad_area;
 668 
 669 good_area:
 670         if (type == VERIFY_WRITE)
 671                 goto check_write;
 672         for (;;) {
 673                 struct vm_area_struct * next;
 674                 if (!(vma->vm_flags & VM_READ))
 675                         goto bad_area;
 676                 if (vma->vm_end - start >= size)
 677                         return 0;
 678                 next = vma->vm_next;
 679                 if (!next || vma->vm_end != next->vm_start)
 680                         goto bad_area;
 681                 vma = next;
 682         }
 683 
 684 check_write:
 685         if (!(vma->vm_flags & VM_WRITE))
 686                 goto bad_area;
 687         if (!wp_works_ok)
 688                 goto check_wp_fault_by_hand;
 689         for (;;) {
 690                 if (vma->vm_end - start >= size)
 691                         break;
 692                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 693                         goto bad_area;
 694                 vma = vma->vm_next;
 695                 if (!(vma->vm_flags & VM_WRITE))
 696                         goto bad_area;
 697         }
 698         return 0;
 699 
 700 check_wp_fault_by_hand:
 701         size--;
 702         size += start & ~PAGE_MASK;
 703         size >>= PAGE_SHIFT;
 704         start &= PAGE_MASK;
 705 
 706         for (;;) {
 707                 do_wp_page(current, vma, start, 1);
 708                 if (!size)
 709                         break;
 710                 size--;
 711                 start += PAGE_SIZE;
 712                 if (start < vma->vm_end)
 713                         continue;
 714                 vma = vma->vm_next;
 715                 if (!vma || vma->vm_start != start)
 716                         goto bad_area;
 717                 if (!(vma->vm_flags & VM_WRITE))
 718                         goto bad_area;
 719         }
 720         return 0;
 721 
 722 bad_area:
 723         return -EFAULT;
 724 }
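
     /*
      * The usual calling pattern (sketch): a system call checks a user
      * buffer before touching it, along the lines of
      *
      *      error = verify_area(VERIFY_WRITE, buf, count);
      *      if (error)
      *              return error;
      *
      * where buf and count stand for whatever user pointer and length
      * are about to be accessed.
      */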
 725 
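     /*
      * Map a fresh zeroed page, made writable on top of the vma's page
      * protection, at the faulting pte.  If no page can be allocated the
      * process is killed via oom() and BAD_PAGE is mapped instead.
      */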
 726 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 727 {
 728         unsigned long tmp;
 729 
 730         if (!(tmp = get_free_page(GFP_KERNEL))) {
 731                 oom(tsk);
 732                 put_page(page_table, BAD_PAGE);
 733                 return;
 734         }
 735         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 736 }
 737 
 738 /*
 739  * try_to_share() checks the page at address "address" in the task "p",
 740  * to see if it exists, and if it is clean. If so, share it with the current
 741  * task.
 742  *
 743  * NOTE! This assumes we have checked that p != current, and that they
 744  * share the same inode and can generally otherwise be shared.
 745  */
 746 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 747         unsigned long from_address, struct vm_area_struct * from_area,
 748         unsigned long newpage)
 749 {
 750         pgd_t * from_dir, * to_dir;
 751         pmd_t * from_middle, * to_middle;
 752         pte_t * from_table, * to_table;
 753         pte_t from, to;
 754 
 755         from_dir = pgd_offset(from_area->vm_mm,from_address);
 756 /* is there a page-directory at from? */
 757         if (pgd_none(*from_dir))
 758                 return 0;
 759         if (pgd_bad(*from_dir)) {
 760                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 761                 pgd_clear(from_dir);
 762                 return 0;
 763         }
 764         from_middle = pmd_offset(from_dir, from_address);
 765 /* is there a mid-directory at from? */
 766         if (pmd_none(*from_middle))
 767                 return 0;
 768         if (pmd_bad(*from_middle)) {
 769                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 770                 pmd_clear(from_middle);
 771                 return 0;
 772         }
 773         from_table = pte_offset(from_middle, from_address);
 774         from = *from_table;
 775 /* is the page present? */
 776         if (!pte_present(from))
 777                 return 0;
 778 /* if it is dirty it must be from a shared mapping to be shared */
 779         if (pte_dirty(from)) {
 780                 if (!(from_area->vm_flags & VM_SHARED))
 781                         return 0;
 782         }
 783 /* is the page reasonable at all? */
 784         if (pte_page(from) >= high_memory)
 785                 return 0;
 786         if (mem_map[MAP_NR(pte_page(from))].reserved)
 787                 return 0;
 788 /* is the destination ok? */
 789         to_dir = pgd_offset(to_area->vm_mm,to_address);
 790 /* is there a page-directory at to? */
 791         if (pgd_none(*to_dir))
 792                 return 0;
 793         if (pgd_bad(*to_dir)) {
 794                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 795                 return 0;
 796         }
 797         to_middle = pmd_offset(to_dir, to_address);
 798 /* is there a mid-directory at to? */
 799         if (pmd_none(*to_middle))
 800                 return 0;
 801         if (pmd_bad(*to_middle)) {
 802                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 803                 return 0;
 804         }
 805         to_table = pte_offset(to_middle, to_address);
 806         to = *to_table;
 807         if (!pte_none(to))
 808                 return 0;
 809 /* do we copy? */
 810         if (newpage) {
 811                 /* if it's in the swap cache, it's dirty by implication */
 812                 /* so we can't use it if it's not from a shared mapping */
 813                 if (in_swap_cache(pte_page(from))) {
 814                         if (!(from_area->vm_flags & VM_SHARED))
 815                                 return 0;
 816                 }
 817                 copy_page(pte_page(from), newpage);
 818                 set_pte(to_table, mk_pte(newpage, to_area->vm_page_prot));
 819                 return 1;
 820         }
 821 /*
 822  * do a final swap-cache test before sharing them: if it's in the swap
 823  * cache, we have to remove it now, as we get two pointers to the same
 824  * physical page and the cache can't handle it. Mark the original dirty.
 825  *
 826  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 827  * with a dirty "from", the from-mapping is a shared map, so we can trust
 828  * the page contents to be up-to-date
 829  */
 830         if (in_swap_cache(pte_page(from))) {
 831                 if (!(from_area->vm_flags & VM_SHARED))
 832                         return 0;
 833                 set_pte(from_table, pte_mkdirty(from));
 834                 delete_from_swap_cache(pte_page(from));
 835         }
 836         mem_map[MAP_NR(pte_page(from))].count++;
 837         set_pte(to_table, mk_pte(pte_page(from), to_area->vm_page_prot));
 838 /* Check if we need to do anything at all to the 'from' field */
 839         if (!pte_write(from))
 840                 return 1;
 841         if (from_area->vm_flags & VM_SHARED)
 842                 return 1;
 843 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 844         set_pte(from_table, pte_wrprotect(from));
 845         invalidate();
 846         return 1;
 847 }
 848 
 849 /*
 850  * share_page() tries to find a process that could share a page with
 851  * the current one.
 852  *
 853  * We first check if it is at all feasible by checking inode->i_count.
 854  * It should be >1 if there are other tasks sharing this inode.
 855  */
 856 static int share_page(struct vm_area_struct * area, unsigned long address,
 857         int write_access, unsigned long newpage)
 858 {
 859         struct inode * inode;
 860         unsigned long offset;
 861         unsigned long from_address;
 862         unsigned long give_page;
 863         struct vm_area_struct * mpnt;
 864 
 865         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 866                 return 0;
 867         /* do we need to copy or can we just share? */
 868         give_page = 0;
 869         if (write_access && !(area->vm_flags & VM_SHARED)) {
 870                 if (!newpage)
 871                         return 0;
 872                 give_page = newpage;
 873         }
 874         offset = address - area->vm_start + area->vm_offset;
 875         /* See if there is something in the VM we can share pages with. */
 876         /* Traverse the entire circular i_mmap list, except `area' itself. */
 877         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 878                 /* must be same inode */
 879                 if (mpnt->vm_inode != inode) {
 880                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 881                         break;  
 882                 }
 883                 /* offsets must be mutually page-aligned */
 884                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 885                         continue;
 886                 /* the other area must actually cover the wanted page.. */
 887                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 888                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 889                         continue;
 890                 /* .. NOW we can actually try to use the same physical page */
 891                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 892                         continue;
 893                 /* free newpage if we never used it.. */
 894                 if (give_page || !newpage)
 895                         return 1;
 896                 free_page(newpage);
 897                 return 1;
 898         }
 899         return 0;
 900 }
 901 
 902 /*
 903  * This function tries to find a page that is shared with the buffer cache,
 904  * and if so it moves the buffer cache to a new location.
 905  *
 906  * It returns non-zero if we used up the "new_page" page.
 907  */
 908 static int unshare(struct vm_area_struct *vma, unsigned long address, unsigned long new_page)
 909 {
 910         pgd_t *page_dir;
 911         pmd_t *page_middle;
 912         pte_t *page_table, pte;
 913         unsigned long old_page;
 914         struct buffer_head * bh, * tmp;
 915 
 916         page_dir = pgd_offset(vma->vm_mm, address);
 917         if (pgd_none(*page_dir))
 918                 return 0;
 919         if (pgd_bad(*page_dir)) {
 920                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 921                 pgd_clear(page_dir);
 922                 return 0;
 923         }
 924         page_middle = pmd_offset(page_dir, address);
 925         if (pmd_none(*page_middle))
 926                 return 0;
 927         if (pmd_bad(*page_middle)) {
 928                 printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 929                 pmd_clear(page_middle);
 930                 return 0;
 931         }
 932         page_table = pte_offset(page_middle, address);
 933         pte = *page_table;
 934         if (!pte_present(pte))
 935                 return 0;
 936         old_page = pte_page(pte);
 937         if (MAP_NR(old_page) > MAP_NR(high_memory))
 938                 return 0;
 939         address &= ~PAGE_MASK;
 940         memset((void *) (old_page + address), 0, PAGE_SIZE - address);
 941         bh = buffer_pages[MAP_NR(old_page)];
 942         if (!bh)
 943                 return 0;
 944         if (!new_page) {
 945                 printk("Aieee... unshare(): no page available\n");
 946                 return 0;
 947         }
 948         buffer_pages[MAP_NR(old_page)] = NULL;
 949         copy_page(old_page, new_page);
 950         free_page(old_page);
 951         old_page -= new_page;
 952         buffer_pages[MAP_NR(new_page)] = bh;
 953         tmp = bh;
 954         do {
 955                 tmp->b_data -= old_page;
 956                 tmp = tmp->b_this_page;
 957         } while (tmp != bh);
 958         return 1;
 959 }
 960 
 961 /*
 962  * Handle all mappings that got truncated by a "truncate()"
 963  * system call.
 964  *
 965  * NOTE! We have to be ready to update the memory sharing
 966  * between the file and the memory map for a potential last
 967  * incomplete page.  Ugly, but necessary.
 968  */
 969 void vmtruncate(struct inode * inode, unsigned long offset)
 970 {
 971         unsigned long page;
 972         struct vm_area_struct * mpnt;
 973 
 974         if (!inode->i_mmap)
 975                 return;
 976         page = __get_free_page(GFP_KERNEL);
 977         mpnt = inode->i_mmap;
 978         if (!mpnt) {
 979                 free_page(page);
 980                 return;
 981         }
 982         do {
 983                 unsigned long start = mpnt->vm_start;
 984                 unsigned long len = mpnt->vm_end - start;
 985                 unsigned long diff;
 986 
 987                 /* mapping wholly truncated? */
 988                 if (mpnt->vm_offset >= offset) {
 989                         zap_page_range(mpnt->vm_mm, start, len);
 990                         continue;
 991                 }
 992                 /* mapping wholly unaffected? */
 993                 diff = offset - mpnt->vm_offset;
 994                 if (diff >= len)
 995                         continue;
 996                 /* Ok, partially affected.. */
 997                 start += diff;
 998                 len = (len - diff) & PAGE_MASK;
 999                 /* Ugh, here comes the _really_ ugly part.. */
1000                 if (start & ~PAGE_MASK) {
1001                         if (unshare(mpnt, start, page))
1002                                 page = 0;
1003                         start = (start + ~PAGE_MASK) & PAGE_MASK;
1004                 }
1005                 zap_page_range(mpnt->vm_mm, start, len);
1006         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
1007         free_page(page);
1008 }
1009 
1010 /*
1011  * fill in an empty page-table if none exists.
1012  */
1013 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
1014 {
1015         pgd_t *pgd;
1016         pmd_t *pmd;
1017         pte_t *pte;
1018 
1019         pgd = pgd_offset(tsk->mm, address);
1020         pmd = pmd_alloc(pgd, address);
1021         if (!pmd) {
1022                 oom(tsk);
1023                 return NULL;
1024         }
1025         pte = pte_alloc(pmd, address);
1026         if (!pte) {
1027                 oom(tsk);
1028                 return NULL;
1029         }
1030         return pte;
1031 }
1032 
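     /*
      * Bring a swapped-out page back in.  Without a vma-specific swapin
      * operation this is just swap_in(); otherwise the vma's swapin op
      * supplies the page, the pte is re-checked against the original
      * entry (it may have changed while the page was being brought in),
      * and the new page is write-protected if it is shared and the
      * mapping is not VM_SHARED.
      */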
1033 static inline void do_swap_page(struct task_struct * tsk, 
1034         struct vm_area_struct * vma, unsigned long address,
1035         pte_t * page_table, pte_t entry, int write_access)
1036 {
1037         pte_t page;
1038 
1039         if (!vma->vm_ops || !vma->vm_ops->swapin) {
1040                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
1041                 return;
1042         }
1043         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
1044         if (pte_val(*page_table) != pte_val(entry)) {
1045                 free_page(pte_page(page));
1046                 return;
1047         }
1048         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
1049                 page = pte_wrprotect(page);
1050         ++vma->vm_mm->rss;
1051         ++tsk->maj_flt;
1052         set_pte(page_table, page);
1053         return;
1054 }
1055 
1056 /*
1057  * do_no_page() tries to create a new page mapping. It aggressively
1058  * tries to share with existing pages, but makes a separate copy if
1059  * the "write_access" parameter is true in order to avoid the next
1060  * page fault.
1061  */
1062 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
1063         unsigned long address, int write_access)
1064 {
1065         pte_t * page_table;
1066         pte_t entry;
1067         unsigned long page;
1068 
1069         page_table = get_empty_pgtable(tsk, address);
1070         if (!page_table)
1071                 return;
1072         entry = *page_table;
1073         if (pte_present(entry))
1074                 return;
1075         if (!pte_none(entry)) {
1076                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
1077                 return;
1078         }
1079         address &= PAGE_MASK;
1080         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1081                 ++vma->vm_mm->rss;
1082                 ++tsk->min_flt;
1083                 get_empty_page(tsk, vma, page_table);
1084                 return;
1085         }
1086         page = __get_free_page(GFP_KERNEL);
1087         if (share_page(vma, address, write_access, page)) {
1088                 ++vma->vm_mm->rss;
1089                 ++tsk->min_flt;
1090                 return;
1091         }
1092         if (!page) {
1093                 oom(tsk);
1094                 put_page(page_table, BAD_PAGE);
1095                 return;
1096         }
1097         ++tsk->maj_flt;
1098         ++vma->vm_mm->rss;
1099         /*
1100          * The fourth argument is "no_share", which tells the low-level code
1101          * to copy, not share the page even if sharing is possible.  It's
1102          * essentially an early COW detection 
1103          */
1104         page = vma->vm_ops->nopage(vma, address, page,
1105                 write_access && !(vma->vm_flags & VM_SHARED));
1106         if (share_page(vma, address, write_access, 0)) {
1107                 free_page(page);
1108                 return;
1109         }
1110         /*
1111          * This silly early PAGE_DIRTY setting removes a race
1112          * due to the bad i386 page protection. But it's valid
1113          * for other architectures too.
1114          *
1115          * Note that if write_access is true, we either now have
1116  * an exclusive copy of the page, or this is a shared mapping,
1117          * so we can make it writable and dirty to avoid having to
1118          * handle that later.
1119          */
1120         entry = mk_pte(page, vma->vm_page_prot);
1121         if (write_access) {
1122                 entry = pte_mkwrite(pte_mkdirty(entry));
1123         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
1124                 entry = pte_wrprotect(entry);
1125         put_page(page_table, entry);
1126 }
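
     /*
      * As used above, the vm_ops->nopage() method gets the vma, the
      * page-aligned faulting address, a freshly allocated page and the
      * "no_share" flag, and returns the address of the page that should
      * actually be mapped; do_no_page() then builds the pte around it.
      */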
1127 
1128 /*
1129  * The above separate functions for the no-page and wp-page
1130  * cases will go away (they mostly do the same thing anyway),
1131  * and we'll instead use only a general "handle_mm_fault()".
1132  *
1133  * These routines also need to handle stuff like marking pages dirty
1134  * and/or accessed for architectures that don't do it in hardware (most
1135  * RISC architectures).  The early dirtying is also good on the i386.
1136  *
1137  * There is also a hook called "update_mmu_cache()" that architectures
1138  * with external mmu caches can use to update those (ie the Sparc or
1139  * PowerPC hashed page tables that act as extended TLBs).
1140  */
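     /*
      * A sketch of the expected caller (the architecture's page fault
      * handler, not in this file): it resolves the faulting address to a
      * vma with find_vma(), checks the access against vma->vm_flags and
      * then calls
      *
      *      handle_mm_fault(vma, address, write_access);
      *
      * leaving the pte-level work to the routines below.
      */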
1141 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1142         int write_access, pte_t * pte)
1143 {
1144         if (!pte_present(*pte)) {
1145                 do_no_page(current, vma, address, write_access);
1146                 return;
1147         }
1148         set_pte(pte, pte_mkyoung(*pte));
1149         if (!write_access)
1150                 return;
1151         if (pte_write(*pte)) {
1152                 set_pte(pte, pte_mkdirty(*pte));
1153                 return;
1154         }
1155         do_wp_page(current, vma, address, write_access);
1156 }
1157 
1158 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1159         int write_access)
1160 {
1161         pgd_t *pgd;
1162         pmd_t *pmd;
1163         pte_t *pte;
1164 
1165         pgd = pgd_offset(vma->vm_mm, address);
1166         pmd = pmd_alloc(pgd, address);
1167         if (!pmd)
1168                 goto no_memory;
1169         pte = pte_alloc(pmd, address);
1170         if (!pte)
1171                 goto no_memory;
1172         handle_pte_fault(vma, address, write_access, pte);
1173         update_mmu_cache(vma, address, *pte);
1174         return;
1175 no_memory:
1176         oom(current);
1177 }
