root/mm/filemap.c


DEFINITIONS

This source file includes the following definitions:
  1. invalidate_inode_pages
  2. truncate_inode_pages
  3. shrink_mmap
  4. page_unuse
  5. update_vm_cache
  6. add_to_page_cache
  7. try_to_read_ahead
  8. __wait_on_page
  9. generic_file_readahead
  10. generic_file_read
  11. fill_page
  12. filemap_nopage
  13. do_write_page
  14. filemap_write_page
  15. filemap_swapout
  16. filemap_swapin
  17. filemap_sync_pte
  18. filemap_sync_pte_range
  19. filemap_sync_pmd_range
  20. filemap_sync
  21. filemap_unmap
  22. generic_file_mmap
  23. msync_interval
  24. sys_msync

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994, 1995  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem does this differently, for example)
  11  */
  12 #include <linux/stat.h>
  13 #include <linux/sched.h>
  14 #include <linux/kernel.h>
  15 #include <linux/mm.h>
  16 #include <linux/shm.h>
  17 #include <linux/errno.h>
  18 #include <linux/mman.h>
  19 #include <linux/string.h>
  20 #include <linux/malloc.h>
  21 #include <linux/fs.h>
  22 #include <linux/locks.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/swap.h>
  25 
  26 #include <asm/segment.h>
  27 #include <asm/system.h>
  28 #include <asm/pgtable.h>
  29 
  30 /*
  31  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  32  * though.
  33  *
  34  * Shared mappings now work. 15.8.1995  Bruno.
  35  */
  36 
  37 unsigned long page_cache_size = 0;
  38 struct page * page_hash_table[PAGE_HASH_SIZE];
  39 
  40 /*
  41  * Simple routines for both non-shared and shared mappings.
  42  */
  43 
  44 /*
  45  * Invalidate the pages of an inode, removing all pages that aren't
  46  * locked down (those are sure to be up-to-date anyway, so we shouldn't
  47  * invalidate them).
  48  */
  49 void invalidate_inode_pages(struct inode * inode)
  50 {
  51         struct page ** p;
  52         struct page * page;
  53 
  54         p = &inode->i_pages;
  55         while ((page = *p) != NULL) {
  56                 if (PageLocked(page)) {
  57                         p = &page->next;
  58                         continue;
  59                 }
  60                 inode->i_nrpages--;
  61                 if ((*p = page->next) != NULL)
  62                         (*p)->prev = page->prev;
  63                 page->dirty = 0;
  64                 page->next = NULL;
  65                 page->prev = NULL;
  66                 remove_page_from_hash_queue(page);
  67                 page->inode = NULL;
  68                 free_page(page_address(page));
  69                 continue;
  70         }
  71 }
  72 
  73 /*
  74  * Truncate the page cache at a set offset, removing the pages
  75  * that are beyond that offset (and zeroing out partial pages).
  76  */
  77 void truncate_inode_pages(struct inode * inode, unsigned long start)
  78 {
  79         struct page ** p;
  80         struct page * page;
  81 
  82 repeat:
  83         p = &inode->i_pages;
  84         while ((page = *p) != NULL) {
  85                 unsigned long offset = page->offset;
  86 
  87                 /* page wholly truncated - free it */
  88                 if (offset >= start) {
  89                         if (PageLocked(page)) {
  90                                 wait_on_page(page);
  91                                 goto repeat;
  92                         }
  93                         inode->i_nrpages--;
  94                         if ((*p = page->next) != NULL)
  95                                 (*p)->prev = page->prev;
  96                         page->dirty = 0;
  97                         page->next = NULL;
  98                         page->prev = NULL;
  99                         remove_page_from_hash_queue(page);
 100                         page->inode = NULL;
 101                         free_page(page_address(page));
 102                         continue;
 103                 }
 104                 p = &page->next;
 105                 offset = start - offset;
 106                 /* partial truncate, clear end of page */
 107                 if (offset < PAGE_SIZE)
 108                         memset((void *) (offset + page_address(page)), 0, PAGE_SIZE - offset);
 109         }
 110 }
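
/*
 * Annotation: the two routines above differ in how they treat busy pages.
 * invalidate_inode_pages() simply skips locked pages (IO still in flight)
 * and frees the rest, while truncate_inode_pages() waits for locked pages,
 * restarts its scan, and zeroes the tail of the page straddling the new
 * end of file: with PAGE_SIZE = 4096 and start = 0x1200, the page at
 * offset 0x1000 keeps its first 0x200 bytes and the remaining 0xe00 bytes
 * are cleared.
 */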
 111 
 112 int shrink_mmap(int priority, int dma)
 113 {
 114         static int clock = 0;
 115         struct page * page;
 116         unsigned long limit = MAP_NR(high_memory);
 117         struct buffer_head *tmp, *bh;
 118 
 119         priority = (limit<<2) >> priority;
 120         page = mem_map + clock;
 121         do {
 122                 priority--;
 123                 if (PageLocked(page))
 124                         goto next;
 125                 if (dma && !PageDMA(page))
 126                         goto next;
 127                 /* First of all, regenerate the page's referenced bit
 128                    from any buffers in the page */
 129                 bh = page->buffers;
 130                 if (bh) {
 131                         tmp = bh;
 132                         do {
 133                                 if (buffer_touched(tmp)) {
 134                                         clear_bit(BH_Touched, &tmp->b_state);
 135                                         set_bit(PG_referenced, &page->flags);
 136                                 }
 137                                 tmp = tmp->b_this_page;
 138                         } while (tmp != bh);
 139                 }
 140 
 141                 /* We can't throw away shared pages, but we do mark
 142                    them as referenced.  This relies on the fact that
 143                    no page is currently in both the page cache and the
 144                    buffer cache; we'd have to modify the following
 145                    test to allow for that case. */
 146 
 147                 switch (page->count) {
 148                         case 1:
 149                                 /* If it has been referenced recently, don't free it */
 150                                 if (clear_bit(PG_referenced, &page->flags))
 151                                         break;
 152 
 153                                 /* is it a page cache page? */
 154                                 if (page->inode) {
 155                                         remove_page_from_hash_queue(page);
 156                                         remove_page_from_inode_queue(page);
 157                                         free_page(page_address(page));
 158                                         return 1;
 159                                 }
 160 
 161                                 /* is it a buffer cache page? */
 162                                 if (bh && try_to_free_buffer(bh, &bh, 6))
 163                                         return 1;
 164                                 break;
 165 
 166                         default:
  167                                 /* more than one user: we can't throw it away */
 168                                 set_bit(PG_referenced, &page->flags);
 169                                 /* fall through */
 170                         case 0:
 171                                 /* nothing */
 172                 }
 173 next:
 174                 page++;
 175                 clock++;
 176                 if (clock >= limit) {
 177                         clock = 0;
 178                         page = mem_map;
 179                 }
 180         } while (priority > 0);
 181         return 0;
 182 }
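
/*
 * Annotation: the scan budget above is (limit << 2) >> priority mem_map
 * entries per call, so e.g. priority 6 examines (4 * limit) / 64 =
 * limit / 16 of all physical pages, and smaller priority values scan
 * proportionally more.  The static "clock" makes each call resume the
 * scan where the previous one stopped.
 */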
 183 
 184 /*
 185  * This is called from try_to_swap_out() when we try to get rid of some
 186  * pages..  If we're unmapping the last occurrence of this page, we also
 187  * free it from the page hash-queues etc, as we don't want to keep it
 188  * in-core unnecessarily.
 189  */
 190 unsigned long page_unuse(unsigned long page)
 191 {
 192         struct page * p = mem_map + MAP_NR(page);
 193         int count = p->count;
 194 
 195         if (count != 2)
 196                 return count;
 197         if (!p->inode)
 198                 return count;
 199         remove_page_from_hash_queue(p);
 200         remove_page_from_inode_queue(p);
 201         free_page(page);
 202         return 1;
 203 }
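
/*
 * Annotation: a count of exactly 2 means the only remaining references are
 * the mapping being torn down and the page cache itself, so the cached
 * copy can safely be dropped here; any other count is just reported back
 * and the page is left alone.
 */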
 204 
 205 /*
 206  * Update a page cache copy, when we're doing a "write()" system call
 207  * See also "update_vm_cache()".
 208  */
 209 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 210 {
 211         unsigned long offset, len;
 212 
 213         offset = (pos & ~PAGE_MASK);
 214         pos = pos & PAGE_MASK;
 215         len = PAGE_SIZE - offset;
 216         do {
 217                 struct page * page;
 218 
 219                 if (len > count)
 220                         len = count;
 221                 page = find_page(inode, pos);
 222                 if (page) {
 223                         unsigned long addr;
 224 
 225                         wait_on_page(page);
 226                         addr = page_address(page);
 227                         memcpy((void *) (offset + addr), buf, len);
 228                         free_page(addr);
 229                 }
 230                 count -= len;
 231                 buf += len;
 232                 len = PAGE_SIZE;
 233                 offset = 0;
 234                 pos += PAGE_SIZE;
 235         } while (count);
 236 }
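
/*
 * Annotation: a worked example of the splitting above, assuming 4 KB
 * pages: for pos = 0x1320 and count = 0x2000, the first iteration copies
 * 0x1000 - 0x320 = 0xce0 bytes into the cached page at offset 0x1000, the
 * second copies a full 0x1000 bytes into the page at 0x2000, and the third
 * copies the remaining 0x320 bytes into the page at 0x3000.  Pages that
 * are not in the cache are simply skipped; find_page() takes a reference
 * that the free_page(addr) here releases.
 */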
 237 
 238 static inline void add_to_page_cache(struct page * page,
 239         struct inode * inode, unsigned long offset)
 240 {
 241         page->count++;
 242         page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 243         page->offset = offset;
 244         add_page_to_inode_queue(inode, page);
 245         add_page_to_hash_queue(inode, page);
 246 }
 247 
 248 /*
 249  * Try to read ahead in the file. "page_cache" is a potentially free page
 250  * that we could use for the cache (if it is 0 we can try to create one,
 251  * this is all overlapped with the IO on the previous page finishing anyway)
 252  */
 253 static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
 254 {
 255         struct page * page;
 256 
 257         offset &= PAGE_MASK;
 258         if (!page_cache) {
 259                 page_cache = __get_free_page(GFP_KERNEL);
 260                 if (!page_cache)
 261                         return 0;
 262         }
 263         if (offset >= inode->i_size)
 264                 return page_cache;
 265 #if 1
 266         page = find_page(inode, offset);
 267         if (page) {
 268                 page->count--;
 269                 return page_cache;
 270         }
 271         /*
 272          * Ok, add the new page to the hash-queues...
 273          */
 274         page = mem_map + MAP_NR(page_cache);
 275         add_to_page_cache(page, inode, offset);
 276         inode->i_op->readpage(inode, page);
 277         free_page(page_cache);
 278         return 0;
 279 #else
 280         return page_cache;
 281 #endif
 282 }
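
/*
 * Annotation: the return value is the "spare" page handed back to the
 * caller: the original page_cache (or a freshly allocated page) if it was
 * not needed, or 0 if add_to_page_cache() consumed it.  The page->count--
 * in the find_page() branch undoes the reference find_page() took, since
 * we only wanted to know whether the page already exists.
 */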
 283 
 284 /* 
 285  * Wait for IO to complete on a locked page.
 286  */
 287 void __wait_on_page(struct page *page)
 288 {
 289         struct wait_queue wait = { current, NULL };
 290 
 291         page->count++;
 292         add_wait_queue(&page->wait, &wait);
 293 repeat:
 294         run_task_queue(&tq_disk);
 295         current->state = TASK_UNINTERRUPTIBLE;
 296         if (PageLocked(page)) {
 297                 schedule();
 298                 goto repeat;
 299         }
 300         remove_wait_queue(&page->wait, &wait);
 301         page->count--;
 302         current->state = TASK_RUNNING;
 303 }
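
/*
 * Annotation: this is the classic sleep pattern: pin the page (count++),
 * queue ourselves on its wait queue, and only then set TASK_UNINTERRUPTIBLE
 * and re-test PageLocked(), so the unlock-time wakeup cannot be lost
 * between the test and the schedule().  run_task_queue(&tq_disk) unplugs
 * the block device queue so the IO we are about to wait for actually gets
 * submitted.
 */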
 304 
 305 
 306 /*
 307  * This is a generic file read routine, and uses the
 308  * inode->i_op->readpage() function for the actual low-level
 309  * stuff.
 310  *
 311  * This is really ugly. But the goto's actually try to clarify some
 312  * of the logic when it comes to error handling etc.
 313  */
 314 #define MAX_READAHEAD (PAGE_SIZE*8)
 315 #define MIN_READAHEAD (PAGE_SIZE)
 316 
 317 static inline unsigned long generic_file_readahead(struct file * filp, struct inode * inode,
 318         int try_async, unsigned long pos, struct page * page,
 319         unsigned long page_cache)
 320 {
 321         unsigned long max_ahead, ahead;
 322         unsigned long rapos, ppos;
 323 
 324         ppos = pos & PAGE_MASK;
 325 /*
 326  * If the current page is locked, try some synchronous read-ahead in order
 327  * to avoid too small IO requests.
 328  */
 329         if (PageLocked(page)) {
 330                 max_ahead = filp->f_ramax;
 331                 rapos = ppos;
 332 /*              try_async = 1  */ /* Seems questionable */
 333         }
 334 /*
 335  * The current page is not locked
 336  * It may be the moment to try asynchronous read-ahead.
 337  */
 338         else {
 339 /*
 340  * Compute the position of the last page we have tried to read
 341  */
 342                 rapos = filp->f_rapos & PAGE_MASK;
 343                 if (rapos) rapos -= PAGE_SIZE;
 344 /*
  345  * If asynchronous read-ahead is the right tactic and the current position is
  346  * inside the previous read-ahead window,
  347  * check the last page we tried to read:
  348  * - if it is locked, the previous IO request is probably not complete, and we
  349  *    will not try another IO request.
  350  * - if it is not locked, the previous IO request is probably complete, and this
  351  *    is a good moment to try a new asynchronous read-ahead request.
  352  * try_async = 2 means that we have to force an unplug of the device in
  353  * order to force a call to the strategy routine of the disk driver and
  354  * start the IO asynchronously.
 355  */
 356                 if (try_async == 1 && pos <= filp->f_rapos &&
 357                          pos + filp->f_ralen >= filp->f_rapos) {
 358                         struct page *a_page;
 359 /*
 360  * Add ONE page to max_ahead in order to try to have the same IO max size as
 361  * synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
 362  */
 363                         max_ahead = filp->f_ramax + PAGE_SIZE;
 364 
 365                         if (rapos < inode->i_size) {
 366                                 a_page = find_page(inode, rapos);
 367                                 if (a_page) {
 368                                         if (PageLocked(a_page))
 369                                                 max_ahead = 0;
 370                                         a_page->count--;
 371                                 }
 372                         }
 373                         else
 374                                 max_ahead = 0;
 375                         try_async = 2;
 376                 }
 377                 else {
 378                         max_ahead = 0;
 379                 }
 380         }
 381 
 382 /*
 383  * Try to read pages.
  384  * We hope that ll_rw_blk() plugging/unplugging, coalescing and sorting will
  385  * work well enough to keep the actual IO requests from being too bad.
 386  */
 387         ahead = 0;
 388         while (ahead < max_ahead) {
 389                 ahead += PAGE_SIZE;
 390                 page_cache = try_to_read_ahead(inode, rapos + ahead, page_cache);
 391         }
 392 /*
 393  * If we tried to read some pages,
 394  * Store the length of the current read-ahead window.
 395  * If necessary,
 396  *    Try to force unplug of the device in order to start an asynchronous
 397  *    read IO.
 398  */
 399         if (ahead > 0) {
 400                 filp->f_ralen = ahead;
 401                 if (try_async == 2) {
 402 /*
 403  * Schedule() should be changed to run_task_queue(...)
 404  */
 405                         run_task_queue(&tq_disk);
 406                         try_async = 1;
 407                 }
 408         }
 409 /*
 410  * Compute the new read-ahead position.
 411  * It is the position of the next byte.
 412  */
 413         filp->f_rapos = rapos + ahead + PAGE_SIZE;
 414 /*
 415  * Wait on the page if necessary
 416  */
 417         if (PageLocked(page)) {
 418                 __wait_on_page(page);
 419         }
 420         return page_cache;
 421 }
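
/*
 * Annotation: a worked example, assuming 4 KB pages and f_ramax = 16K.  In
 * the synchronous case (current page still locked) rapos = ppos and the
 * loop calls try_to_read_ahead() for ppos+4K, ppos+8K, ppos+12K and
 * ppos+16K; afterwards f_ralen = 16K and f_rapos = ppos + 20K, the byte
 * just past the last page requested.  In the asynchronous case the window
 * advances from the previous f_rapos instead, and try_async = 2 triggers
 * the run_task_queue(&tq_disk) above so the freshly queued requests are
 * pushed down to the driver.
 */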
 422 
 423 
 424 int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
 425 {
 426         int error, read;
 427         unsigned long pos, page_cache;
 428         int try_async;
 429         
 430         if (count <= 0)
 431                 return 0;
 432         error = 0;
 433         read = 0;
 434         page_cache = 0;
 435 
 436         pos = filp->f_pos;
 437 /*
  438  * Don't trust f_reada
  439  * --------------------
  440  * f_reada is set to 0 by seek operations.
  441  * If we believe f_reada, small seek ops break asynchronous read-ahead.
  442  * That may be quite bad for small seeks or rewrite operations.
 443  * I prefer to check if the current position is inside the previous read-ahead
 444  * window.
 445  * If that's true, I assume that the file accesses are sequential enough to
 446  * continue asynchronous read-ahead.
 447  */
 448         if (pos <= filp->f_rapos && pos + filp->f_ralen >= filp->f_rapos) {
 449                 filp->f_reada = 1;
 450         }
 451 /*
 452  * Do minimum read-ahead at the beginning of the file.
  453  * Some tools read only the start of the file.
  454  * Break read-ahead if the file position is after the previous read-ahead
  455  * position, or if the read-ahead position is 0.
 456  */
 457         else if (pos+count < MIN_READAHEAD || !filp->f_rapos ||
 458                  pos > filp->f_rapos) {
 459                 filp->f_reada = 0;
 460         }
 461 
 462 /*
  463  * Now f_reada = 1 means that asynchronous read-ahead is the right tactic.
  464  * We will try asynchronous read-ahead as soon as possible.
  465  * Double the max read-ahead size each time.
  466  *   This heuristic avoids doing large IO for files that are not really
  467  *   accessed sequentially.
 468  */
 469         if (filp->f_reada) {
 470                 try_async = 1;
 471                 filp->f_ramax += filp->f_ramax;
 472         }
 473 /*
  474  * f_reada = 0 means that asynchronous read-ahead would be a bad idea.
  475  * We will not try asynchronous read-ahead first.
  476  * Reset the read-ahead context to zero.
 477  */
 478         else {
 479                 try_async = 0;
 480                 filp->f_rapos = 0;
 481                 filp->f_ralen = 0;
 482                 filp->f_ramax = 0;
 483         }
 484 
 485 /*
 486  * Compute a good value for read-ahead max
  487  * First try a value near count.
  488  * Do at least MIN_READAHEAD and at most MAX_READAHEAD.
  489  * (Should be reworked a little)
 490  */
 491         if (filp->f_ramax < count)
 492                 filp->f_ramax = count & PAGE_MASK;
 493 
 494         if (filp->f_ramax < MIN_READAHEAD)
 495                 filp->f_ramax = MIN_READAHEAD;
 496         else if (filp->f_ramax > MAX_READAHEAD)
 497                 filp->f_ramax = MAX_READAHEAD;
 498 
 499         for (;;) {
 500                 struct page *page;
 501                 unsigned long offset, addr, nr;
 502 
 503                 if (pos >= inode->i_size)
 504                         break;
 505                 offset = pos & ~PAGE_MASK;
 506                 nr = PAGE_SIZE - offset;
 507                 /*
 508                  * Try to find the data in the page cache..
 509                  */
 510                 page = find_page(inode, pos & PAGE_MASK);
 511                 if (page)
 512                         goto found_page;
 513 
 514                 /*
 515                  * Ok, it wasn't cached, so we need to create a new
 516                  * page..
 517                  */
 518                 if (page_cache)
 519                         goto new_page;
 520 
 521                 error = -ENOMEM;
 522                 page_cache = __get_free_page(GFP_KERNEL);
 523                 if (!page_cache)
 524                         break;
 525                 error = 0;
 526 
 527                 /*
 528                  * That could have slept, so we need to check again..
 529                  */
 530                 if (pos >= inode->i_size)
 531                         break;
 532                 page = find_page(inode, pos & PAGE_MASK);
 533                 if (!page)
 534                         goto new_page;
 535 
 536 found_page:
 537                 addr = page_address(page);
 538                 if (nr > count)
 539                         nr = count;
 540 
 541                 page_cache = generic_file_readahead(filp, inode, try_async, pos, page, page_cache);
 542 
 543                 if (!PageUptodate(page))
 544                         goto read_page;
 545                 if (nr > inode->i_size - pos)
 546                         nr = inode->i_size - pos;
 547                 memcpy_tofs(buf, (void *) (addr + offset), nr);
 548                 free_page(addr);
 549                 buf += nr;
 550                 pos += nr;
 551                 read += nr;
 552                 count -= nr;
 553                 if (count)
 554                         continue;
 555                 break;
 556         
 557 
 558 new_page:
 559                 /*
 560                  * Ok, add the new page to the hash-queues...
 561                  */
 562                 addr = page_cache;
 563                 page = mem_map + MAP_NR(page_cache);
 564                 page_cache = 0;
 565                 add_to_page_cache(page, inode, pos & PAGE_MASK);
 566 
 567                 /*
 568                  * Error handling is tricky. If we get a read error,
 569                  * the cached page stays in the cache (but uptodate=0),
 570                  * and the next process that accesses it will try to
 571                  * re-read it. This is needed for NFS etc, where the
 572                  * identity of the reader can decide if we can read the
 573                  * page or not..
 574                  */
 575 read_page:
 576                 error = inode->i_op->readpage(inode, page);
 577                 if (!error)
 578                         goto found_page;
 579                 free_page(addr);
 580                 break;
 581         }
 582 
 583         filp->f_pos = pos;
 584         filp->f_reada = 1;
 585         if (page_cache)
 586                 free_page(page_cache);
 587         if (!IS_RDONLY(inode)) {
 588                 inode->i_atime = CURRENT_TIME;
 589                 inode->i_dirt = 1;
 590         }
 591         if (!read)
 592                 read = error;
 593         return read;
 594 }
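
/*
 * Annotation: the f_ramax handling above is a simple doubling heuristic.
 * With 4 KB pages (MIN_READAHEAD = 4K, MAX_READAHEAD = 32K) a sequential
 * reader issuing small reads sees the window grow roughly 4K -> 8K -> 16K
 * -> 32K over the first few calls and then stay clamped at 32K, while a
 * seek outside the previous read-ahead window resets f_rapos, f_ralen and
 * f_ramax and drops back to synchronous read-ahead.
 */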
 595 
 596 /*
 597  * Find a cached page and wait for it to become up-to-date, return
 598  * the page address.  Increments the page count.
 599  */
 600 static inline unsigned long fill_page(struct inode * inode, unsigned long offset)
 601 {
 602         struct page * page;
 603         unsigned long new_page;
 604 
 605         page = find_page(inode, offset);
 606         if (page)
 607                 goto found_page_dont_free;
 608         new_page = __get_free_page(GFP_KERNEL);
 609         page = find_page(inode, offset);
 610         if (page)
 611                 goto found_page;
 612         if (!new_page)
 613                 return 0;
 614         page = mem_map + MAP_NR(new_page);
 615         new_page = 0;
 616         add_to_page_cache(page, inode, offset);
 617         inode->i_op->readpage(inode, page);
 618         if (PageLocked(page))
 619                 new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
 620 found_page:
 621         if (new_page)
 622                 free_page(new_page);
 623 found_page_dont_free:
 624         wait_on_page(page);
 625         return page_address(page);
 626 }
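
/*
 * Annotation: __get_free_page() may sleep, so the cache is probed again
 * after the allocation; whichever copy is found wins and the unused new
 * page is freed.  The function returns with the page unlocked and with an
 * extra reference held, which filemap_nopage() below either hands on to
 * the fault handler or drops again after copying it (the no_share case).
 */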
 627 
 628 /*
 629  * Semantics for shared and private memory areas are different past the end
 630  * of the file. A shared mapping past the last page of the file is an error
 631  * and results in a SIGBUS, while a private mapping just maps in a zero page.
 632  */
 633 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 634 {
 635         unsigned long offset;
 636         struct inode * inode = area->vm_inode;
 637         unsigned long page;
 638 
 639         offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 640         if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 641                 return 0;
 642 
 643         page = fill_page(inode, offset);
 644         if (page && no_share) {
 645                 unsigned long new_page = __get_free_page(GFP_KERNEL);
 646                 if (new_page)
 647                         memcpy((void *) new_page, (void *) page, PAGE_SIZE);
 648                 free_page(page);
 649                 return new_page;
 650         }
 651         return page;
 652 }
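
/*
 * Annotation: no_share is set when the faulting mapping must not share the
 * cached page (a private mapping being written), so the page-cache copy is
 * duplicated into a fresh page and released; shared mappings hand the
 * cache page itself to the fault handler.  The first test makes a shared
 * fault beyond i_size return 0, which the caller turns into the SIGBUS
 * described in the comment above, while a private mapping past EOF still
 * gets zero-filled data through fill_page().
 */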
 653 
 654 /*
 655  * Tries to write a shared mapped page to its backing store. May return -EIO
 656  * if the disk is full.
 657  */
 658 static inline int do_write_page(struct inode * inode, struct file * file,
 659         const char * page, unsigned long offset)
 660 {
 661         int old_fs, retval;
 662         unsigned long size;
 663 
 664         size = offset + PAGE_SIZE;
 665         /* refuse to extend file size.. */
 666         if (S_ISREG(inode->i_mode)) {
 667                 if (size > inode->i_size)
 668                         size = inode->i_size;
 669                 /* Ho humm.. We should have tested for this earlier */
 670                 if (size < offset)
 671                         return -EIO;
 672         }
 673         size -= offset;
 674         old_fs = get_fs();
 675         set_fs(KERNEL_DS);
 676         retval = -EIO;
 677         if (size == file->f_op->write(inode, file, (const char *) page, size))
 678                 retval = 0;
 679         set_fs(old_fs);
 680         return retval;
 681 }
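
/*
 * Annotation: the set_fs(KERNEL_DS) dance is needed because the
 * filesystem's write() routine expects a user-space buffer and copies it
 * with the user-access primitives; temporarily widening the address limit
 * lets it read straight from the kernel-side page being written back.  The
 * size clamp keeps a writeback of the last, partial page from growing
 * i_size.
 */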
 682 
 683 static int filemap_write_page(struct vm_area_struct * vma,
 684         unsigned long offset,
 685         unsigned long page)
 686 {
 687         int result;
 688         struct file file;
 689         struct inode * inode;
 690         struct buffer_head * bh;
 691 
 692         bh = mem_map[MAP_NR(page)].buffers;
 693         if (bh) {
 694                 /* whee.. just mark the buffer heads dirty */
 695                 struct buffer_head * tmp = bh;
 696                 do {
 697                         mark_buffer_dirty(tmp, 0);
 698                         tmp = tmp->b_this_page;
 699                 } while (tmp != bh);
 700                 return 0;
 701         }
 702 
 703         inode = vma->vm_inode;
 704         file.f_op = inode->i_op->default_file_ops;
 705         if (!file.f_op->write)
 706                 return -EIO;
 707         file.f_mode = 3;
 708         file.f_flags = 0;
 709         file.f_count = 1;
 710         file.f_inode = inode;
 711         file.f_pos = offset;
 712         file.f_reada = 0;
 713 
 714         down(&inode->i_sem);
 715         result = do_write_page(inode, &file, (const char *) page, offset);
 716         up(&inode->i_sem);
 717         return result;
 718 }
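
/*
 * Annotation: if the page still has buffer heads attached, the data can be
 * written back through the buffer cache simply by marking those buffers
 * dirty.  Otherwise a throw-away struct file is built on the stack,
 * because by the time a shared page is written out (msync, swapout,
 * munmap) the original file descriptor may be long gone and only the
 * inode is still known.
 */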
 719 
 720 
 721 /*
 722  * Swapping to a shared file: while we're busy writing out the page
 723  * (and the page still exists in memory), we save the page information
 724  * in the page table, so that "filemap_swapin()" can re-use the page
 725  * immediately if it is called while we're busy swapping it out..
 726  *
 727  * Once we've written it all out, we mark the page entry "empty", which
 728  * will result in a normal page-in (instead of a swap-in) from the now
 729  * up-to-date disk file.
 730  */
 731 int filemap_swapout(struct vm_area_struct * vma,
 732         unsigned long offset,
 733         pte_t *page_table)
 734 {
 735         int error;
 736         unsigned long page = pte_page(*page_table);
 737         unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
 738 
 739         flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 740         set_pte(page_table, __pte(entry));
 741         flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 742         error = filemap_write_page(vma, offset, page);
 743         if (pte_val(*page_table) == entry)
 744                 pte_clear(page_table);
 745         return error;
 746 }
 747 
 748 /*
 749  * filemap_swapin() is called only if we have something in the page
 750  * tables that is non-zero (but not present), which we know to be the
 751  * page index of a page that is busy being swapped out (see above).
 752  * So we just use it directly..
 753  */
 754 static pte_t filemap_swapin(struct vm_area_struct * vma,
 755         unsigned long offset,
 756         unsigned long entry)
 757 {
 758         unsigned long page = SWP_OFFSET(entry);
 759 
 760         mem_map[page].count++;
 761         page = (page << PAGE_SHIFT) + PAGE_OFFSET;
 762         return mk_pte(page,vma->vm_page_prot);
 763 }
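
/*
 * Annotation: the round trip between the two routines above: swapout
 * encodes the physical page number in a non-present pte as
 * SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page)); if the page is touched again
 * while the write is still in flight, swapin decodes it with SWP_OFFSET(),
 * bumps the mem_map count and rebuilds a present pte for the same physical
 * page.  Once the write has finished and nobody raced in, swapout clears
 * the entry so the next fault goes through filemap_nopage() instead.
 */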
 764 
 765 
 766 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 767         unsigned long address, unsigned int flags)
 768 {
 769         pte_t pte = *ptep;
 770         unsigned long page;
 771         int error;
 772 
 773         if (!(flags & MS_INVALIDATE)) {
 774                 if (!pte_present(pte))
 775                         return 0;
 776                 if (!pte_dirty(pte))
 777                         return 0;
 778                 flush_cache_page(vma, address);
 779                 set_pte(ptep, pte_mkclean(pte));
 780                 flush_tlb_page(vma, address);
 781                 page = pte_page(pte);
 782                 mem_map[MAP_NR(page)].count++;
 783         } else {
 784                 if (pte_none(pte))
 785                         return 0;
 786                 flush_cache_page(vma, address);
 787                 pte_clear(ptep);
 788                 flush_tlb_page(vma, address);
 789                 if (!pte_present(pte)) {
 790                         swap_free(pte_val(pte));
 791                         return 0;
 792                 }
 793                 page = pte_page(pte);
 794                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 795                         free_page(page);
 796                         return 0;
 797                 }
 798         }
 799         error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 800         free_page(page);
 801         return error;
 802 }
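
/*
 * Annotation: without MS_INVALIDATE only present, dirty ptes matter: they
 * are cleaned in place and the page is written back (the temporary count++
 * keeps it alive across the write and is balanced by the free_page() at
 * the end).  With MS_INVALIDATE the pte is torn down completely: swap
 * entries are released, clean pages are just freed, and dirty pages are
 * written back first unless the caller passed exactly MS_INVALIDATE and
 * nothing else, in which case they are discarded.
 */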
 803 
 804 static inline int filemap_sync_pte_range(pmd_t * pmd,
 805         unsigned long address, unsigned long size, 
 806         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
 807 {
 808         pte_t * pte;
 809         unsigned long end;
 810         int error;
 811 
 812         if (pmd_none(*pmd))
 813                 return 0;
 814         if (pmd_bad(*pmd)) {
 815                 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 816                 pmd_clear(pmd);
 817                 return 0;
 818         }
 819         pte = pte_offset(pmd, address);
 820         offset += address & PMD_MASK;
 821         address &= ~PMD_MASK;
 822         end = address + size;
 823         if (end > PMD_SIZE)
 824                 end = PMD_SIZE;
 825         error = 0;
 826         do {
 827                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
 828                 address += PAGE_SIZE;
 829                 pte++;
 830         } while (address < end);
 831         return error;
 832 }
 833 
 834 static inline int filemap_sync_pmd_range(pgd_t * pgd,
 835         unsigned long address, unsigned long size, 
 836         struct vm_area_struct *vma, unsigned int flags)
 837 {
 838         pmd_t * pmd;
 839         unsigned long offset, end;
 840         int error;
 841 
 842         if (pgd_none(*pgd))
 843                 return 0;
 844         if (pgd_bad(*pgd)) {
 845                 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
 846                 pgd_clear(pgd);
 847                 return 0;
 848         }
 849         pmd = pmd_offset(pgd, address);
 850         offset = address & PMD_MASK;
 851         address &= ~PMD_MASK;
 852         end = address + size;
 853         if (end > PGDIR_SIZE)
 854                 end = PGDIR_SIZE;
 855         error = 0;
 856         do {
 857                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
 858                 address = (address + PMD_SIZE) & PMD_MASK;
 859                 pmd++;
 860         } while (address < end);
 861         return error;
 862 }
 863 
 864 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
 865         size_t size, unsigned int flags)
 866 {
 867         pgd_t * dir;
 868         unsigned long end = address + size;
 869         int error = 0;
 870 
 871         dir = pgd_offset(current->mm, address);
 872         flush_cache_range(vma->vm_mm, end - size, end);
 873         while (address < end) {
 874                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
 875                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 876                 dir++;
 877         }
 878         flush_tlb_range(vma->vm_mm, end - size, end);
 879         return error;
 880 }
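
/*
 * Annotation: the three routines above are a standard three-level page
 * table walk (pgd -> pmd -> pte) over [address, address + size), clamped
 * at each level to the pmd/pgd boundaries.  The per-pte error codes are
 * OR-ed together, so a single failed writeback makes the whole operation
 * report an error without stopping the walk.
 */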
 881 
 882 /*
 883  * This handles (potentially partial) area unmaps..
 884  */
 885 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
 886 {
 887         filemap_sync(vma, start, len, MS_ASYNC);
 888 }
 889 
 890 /*
 891  * Shared mappings need to be able to do the right thing at
 892  * close/unmap/sync. They will also use the private file as
 893  * backing-store for swapping..
 894  */
 895 static struct vm_operations_struct file_shared_mmap = {
 896         NULL,                   /* no special open */
 897         NULL,                   /* no special close */
 898         filemap_unmap,          /* unmap - we need to sync the pages */
 899         NULL,                   /* no special protect */
 900         filemap_sync,           /* sync */
 901         NULL,                   /* advise */
 902         filemap_nopage,         /* nopage */
 903         NULL,                   /* wppage */
 904         filemap_swapout,        /* swapout */
 905         filemap_swapin,         /* swapin */
 906 };
 907 
 908 /*
 909  * Private mappings just need to be able to load in the map.
 910  *
 911  * (This is actually used for shared mappings as well, if we
 912  * know they can't ever get write permissions..)
 913  */
 914 static struct vm_operations_struct file_private_mmap = {
 915         NULL,                   /* open */
 916         NULL,                   /* close */
 917         NULL,                   /* unmap */
 918         NULL,                   /* protect */
 919         NULL,                   /* sync */
 920         NULL,                   /* advise */
 921         filemap_nopage,         /* nopage */
 922         NULL,                   /* wppage */
 923         NULL,                   /* swapout */
 924         NULL,                   /* swapin */
 925 };
 926 
 927 /* This is used for a general mmap of a disk file */
 928 int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
 929 {
 930         struct vm_operations_struct * ops;
 931 
 932         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
 933                 ops = &file_shared_mmap;
 934                 /* share_page() can only guarantee proper page sharing if
 935                  * the offsets are all page aligned. */
 936                 if (vma->vm_offset & (PAGE_SIZE - 1))
 937                         return -EINVAL;
 938         } else {
 939                 ops = &file_private_mmap;
 940                 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
 941                         return -EINVAL;
 942         }
 943         if (!inode->i_sb || !S_ISREG(inode->i_mode))
 944                 return -EACCES;
 945         if (!inode->i_op || !inode->i_op->readpage)
 946                 return -ENOEXEC;
 947         if (!IS_RDONLY(inode)) {
 948                 inode->i_atime = CURRENT_TIME;
 949                 inode->i_dirt = 1;
 950         }
 951         vma->vm_inode = inode;
 952         inode->i_count++;
 953         vma->vm_ops = ops;
 954         return 0;
 955 }
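
/*
 * Annotation: a minimal sketch (not part of this file) of how a filesystem
 * of this vintage plugs into the generics above.  The "myfs_*" names are
 * hypothetical, and the file_operations field order shown (lseek, read,
 * write, readdir, select, ioctl, mmap, open, release, fsync) is an
 * assumption based on contemporary filesystems - check <linux/fs.h> for
 * the authoritative layout.  generic_file_mmap() itself only insists on a
 * regular file with a working inode->i_op->readpage() and a suitably
 * aligned vm_offset.
 */
#if 0
static struct file_operations myfs_file_operations = {
	NULL,			/* lseek - default */
	generic_file_read,	/* read: served from the page cache */
	myfs_file_write,	/* write: should call update_vm_cache() */
	NULL,			/* readdir - not for regular files */
	NULL,			/* select - default */
	NULL,			/* ioctl */
	generic_file_mmap,	/* mmap */
	NULL,			/* open */
	NULL,			/* release */
	myfs_sync_file,		/* fsync */
};
#endif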
 956 
 957 
 958 /*
 959  * The msync() system call.
 960  */
 961 
 962 static int msync_interval(struct vm_area_struct * vma,
 963         unsigned long start, unsigned long end, int flags)
 964 {
 965         if (!vma->vm_inode)
 966                 return 0;
 967         if (vma->vm_ops->sync) {
 968                 int error;
 969                 error = vma->vm_ops->sync(vma, start, end-start, flags);
 970                 if (error)
 971                         return error;
 972                 if (flags & MS_SYNC)
 973                         return file_fsync(vma->vm_inode, NULL);
 974                 return 0;
 975         }
 976         return 0;
 977 }
 978 
 979 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
 980 {
 981         unsigned long end;
 982         struct vm_area_struct * vma;
 983         int unmapped_error, error;
 984 
 985         if (start & ~PAGE_MASK)
 986                 return -EINVAL;
 987         len = (len + ~PAGE_MASK) & PAGE_MASK;
 988         end = start + len;
 989         if (end < start)
 990                 return -EINVAL;
 991         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 992                 return -EINVAL;
 993         if (end == start)
 994                 return 0;
 995         /*
 996          * If the interval [start,end) covers some unmapped address ranges,
 997          * just ignore them, but return -EFAULT at the end.
 998          */
 999         vma = find_vma(current, start);
1000         unmapped_error = 0;
1001         for (;;) {
1002                 /* Still start < end. */
1003                 if (!vma)
1004                         return -EFAULT;
1005                 /* Here start < vma->vm_end. */
1006                 if (start < vma->vm_start) {
1007                         unmapped_error = -EFAULT;
1008                         start = vma->vm_start;
1009                 }
1010                 /* Here vma->vm_start <= start < vma->vm_end. */
1011                 if (end <= vma->vm_end) {
1012                         if (start < end) {
1013                                 error = msync_interval(vma, start, end, flags);
1014                                 if (error)
1015                                         return error;
1016                         }
1017                         return unmapped_error;
1018                 }
1019                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1020                 error = msync_interval(vma, start, vma->vm_end, flags);
1021                 if (error)
1022                         return error;
1023                 start = vma->vm_end;
1024                 vma = vma->vm_next;
1025         }
1026 }
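
/*
 * Annotation: from user space this is reached through msync(2).  A minimal
 * (hypothetical) sequence against the shared-mapping machinery above:
 *
 *	p = mmap(0, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	memcpy(p, buf, len);		-- dirty the shared pages
 *	msync(p, len, MS_SYNC);		-- filemap_sync() + file_fsync()
 *
 * MS_ASYNC writes the dirty pages back (or just marks their buffers dirty)
 * without the final file_fsync(), and MS_INVALIDATE tears the ptes down so
 * the next access faults the file data back in.
 */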
