root/mm/filemap.c


DEFINITIONS

This source file includes the following definitions.
  1. invalidate_inode_pages
  2. truncate_inode_pages
  3. shrink_mmap
  4. page_unuse
  5. update_vm_cache
  6. add_to_page_cache
  7. try_to_read_ahead
  8. __wait_on_page
  9. profile_readahead
  10. generic_file_readahead
  11. generic_file_read
  12. fill_page
  13. filemap_nopage
  14. do_write_page
  15. filemap_write_page
  16. filemap_swapout
  17. filemap_swapin
  18. filemap_sync_pte
  19. filemap_sync_pte_range
  20. filemap_sync_pmd_range
  21. filemap_sync
  22. filemap_unmap
  23. generic_file_mmap
  24. msync_interval
  25. sys_msync

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994, 1995  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem does this differently, for example)
  11  */
  12 #include <linux/stat.h>
  13 #include <linux/sched.h>
  14 #include <linux/kernel.h>
  15 #include <linux/mm.h>
  16 #include <linux/shm.h>
  17 #include <linux/errno.h>
  18 #include <linux/mman.h>
  19 #include <linux/string.h>
  20 #include <linux/malloc.h>
  21 #include <linux/fs.h>
  22 #include <linux/locks.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/swap.h>
  25 
  26 #include <asm/segment.h>
  27 #include <asm/system.h>
  28 #include <asm/pgtable.h>
  29 
  30 /*
  31  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  32  * though.
  33  *
  34  * Shared mappings now work. 15.8.1995  Bruno.
  35  */
  36 
  37 unsigned long page_cache_size = 0;
  38 struct page * page_hash_table[PAGE_HASH_SIZE];
  39 
  40 /*
  41  * Simple routines for both non-shared and shared mappings.
  42  */
  43 
  44 /*
  45  * Invalidate the pages of an inode, removing all pages that aren't
  46  * locked down (those are sure to be up-to-date anyway, so we shouldn't
  47  * invalidate them).
  48  */
  49 void invalidate_inode_pages(struct inode * inode)
  50 {
  51         struct page ** p;
  52         struct page * page;
  53 
  54         p = &inode->i_pages;
  55         while ((page = *p) != NULL) {
  56                 if (PageLocked(page)) {
  57                         p = &page->next;
  58                         continue;
  59                 }
  60                 inode->i_nrpages--;
  61                 if ((*p = page->next) != NULL)
  62                         (*p)->prev = page->prev;
  63                 page->dirty = 0;
  64                 page->next = NULL;
  65                 page->prev = NULL;
  66                 remove_page_from_hash_queue(page);
  67                 page->inode = NULL;
  68                 free_page(page_address(page));
  69                 continue;
  70         }
  71 }
  72 
  73 /*
  74  * Truncate the page cache at a set offset, removing the pages
  75  * that are beyond that offset (and zeroing out partial pages).
  76  */
  77 void truncate_inode_pages(struct inode * inode, unsigned long start)
  78 {
  79         struct page ** p;
  80         struct page * page;
  81 
  82 repeat:
  83         p = &inode->i_pages;
  84         while ((page = *p) != NULL) {
  85                 unsigned long offset = page->offset;
  86 
  87                 /* page wholly truncated - free it */
  88                 if (offset >= start) {
  89                         if (PageLocked(page)) {
  90                                 wait_on_page(page);
  91                                 goto repeat;
  92                         }
  93                         inode->i_nrpages--;
  94                         if ((*p = page->next) != NULL)
  95                                 (*p)->prev = page->prev;
  96                         page->dirty = 0;
  97                         page->next = NULL;
  98                         page->prev = NULL;
  99                         remove_page_from_hash_queue(page);
 100                         page->inode = NULL;
 101                         free_page(page_address(page));
 102                         continue;
 103                 }
 104                 p = &page->next;
 105                 offset = start - offset;
 106                 /* partial truncate, clear end of page */
 107                 if (offset < PAGE_SIZE) {
 108                         memset((void *) (offset + page_address(page)), 0, PAGE_SIZE - offset);
 109                         flush_page_to_ram(page_address(page));
 110                 }
 111         }
 112 }
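
/*
 * Illustrative sketch (compiled out): the partial-truncate arithmetic used
 * above, assuming a 4096-byte page.  Truncating to start = 5000 leaves the
 * page at offset 4096 in place; offset becomes start - page->offset = 904,
 * so PAGE_SIZE - offset = 3192 bytes are cleared, i.e. file bytes 5000..8191.
 * The helper name is made up.
 */
#if 0
static void zero_partial_page_sketch(unsigned long page_addr,
	unsigned long page_offset, unsigned long start)
{
	/* assumes page_offset < start < page_offset + PAGE_SIZE */
	unsigned long offset = start - page_offset;	/* e.g. 5000 - 4096 = 904 */

	/* clear everything in this page from "start" onwards */
	memset((void *) (page_addr + offset), 0, PAGE_SIZE - offset);
}
#endif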
 113 
 114 int shrink_mmap(int priority, int dma)
 115 {
 116         static int clock = 0;
 117         struct page * page;
 118         unsigned long limit = MAP_NR(high_memory);
 119         struct buffer_head *tmp, *bh;
 120 
 121         priority = (limit<<2) >> priority;
 122         page = mem_map + clock;
 123         do {
 124                 priority--;
 125                 if (PageLocked(page))
 126                         goto next;
 127                 if (dma && !PageDMA(page))
 128                         goto next;
 129                 /* First of all, regenerate the page's referenced bit
 130                    from any buffers in the page */
 131                 bh = page->buffers;
 132                 if (bh) {
 133                         tmp = bh;
 134                         do {
 135                                 if (buffer_touched(tmp)) {
 136                                         clear_bit(BH_Touched, &tmp->b_state);
 137                                         set_bit(PG_referenced, &page->flags);
 138                                 }
 139                                 tmp = tmp->b_this_page;
 140                         } while (tmp != bh);
 141                 }
 142 
 143                 /* We can't throw away shared pages, but we do mark
 144                    them as referenced.  This relies on the fact that
 145                    no page is currently in both the page cache and the
 146                    buffer cache; we'd have to modify the following
 147                    test to allow for that case. */
 148 
 149                 switch (page->count) {
 150                         case 1:
 151                                 /* If it has been referenced recently, don't free it */
 152                                 if (clear_bit(PG_referenced, &page->flags))
 153                                         break;
 154 
 155                                 /* is it a page cache page? */
 156                                 if (page->inode) {
 157                                         remove_page_from_hash_queue(page);
 158                                         remove_page_from_inode_queue(page);
 159                                         free_page(page_address(page));
 160                                         return 1;
 161                                 }
 162 
 163                                 /* is it a buffer cache page? */
 164                                 if (bh && try_to_free_buffer(bh, &bh, 6))
 165                                         return 1;
 166                                 break;
 167 
 168                         default:
  169                                 /* more than one user: we can't throw it away */
 170                                 set_bit(PG_referenced, &page->flags);
 171                                 /* fall through */
 172                         case 0:
 173                                 /* nothing */
 174                 }
 175 next:
 176                 page++;
 177                 clock++;
 178                 if (clock >= limit) {
 179                         clock = 0;
 180                         page = mem_map;
 181                 }
 182         } while (priority > 0);
 183         return 0;
 184 }
 185 
 186 /*
 187  * This is called from try_to_swap_out() when we try to get rid of some
 188  * pages..  If we're unmapping the last occurrence of this page, we also
 189  * free it from the page hash-queues etc, as we don't want to keep it
 190  * in-core unnecessarily.
 191  */
 192 unsigned long page_unuse(unsigned long page)
 193 {
 194         struct page * p = mem_map + MAP_NR(page);
 195         int count = p->count;
 196 
 197         if (count != 2)
 198                 return count;
 199         if (!p->inode)
 200                 return count;
 201         remove_page_from_hash_queue(p);
 202         remove_page_from_inode_queue(p);
 203         free_page(page);
 204         return 1;
 205 }
 206 
 207 /*
  208  * Update a page cache copy when we're doing a "write()" system call,
  209  * so that cached pages stay consistent with the newly written data.
 210  */
 211 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 212 {
 213         unsigned long offset, len;
 214 
 215         offset = (pos & ~PAGE_MASK);
 216         pos = pos & PAGE_MASK;
 217         len = PAGE_SIZE - offset;
 218         do {
 219                 struct page * page;
 220 
 221                 if (len > count)
 222                         len = count;
 223                 page = find_page(inode, pos);
 224                 if (page) {
 225                         unsigned long addr;
 226 
 227                         wait_on_page(page);
 228                         addr = page_address(page);
 229                         memcpy((void *) (offset + addr), buf, len);
 230                         free_page(addr);
 231                 }
 232                 count -= len;
 233                 buf += len;
 234                 len = PAGE_SIZE;
 235                 offset = 0;
 236                 pos += PAGE_SIZE;
 237         } while (count);
 238 }
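
/*
 * Illustrative sketch (compiled out): the per-page chunking arithmetic used
 * by update_vm_cache() above, redone in user space with a fixed 4096-byte
 * page size.  The SKETCH_* names and show_chunks() are local to this example.
 */
#if 0
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

static void show_chunks(unsigned long pos, unsigned long count)
{
	unsigned long offset = pos & ~SKETCH_PAGE_MASK;	/* offset within the first page */
	unsigned long len = SKETCH_PAGE_SIZE - offset;	/* room left in the first page */

	pos &= SKETCH_PAGE_MASK;			/* page-aligned position */
	do {
		if (len > count)
			len = count;
		printf("page at %lu: copy %lu bytes at offset %lu\n", pos, len, offset);
		count -= len;
		len = SKETCH_PAGE_SIZE;
		offset = 0;
		pos += SKETCH_PAGE_SIZE;
	} while (count);
}

int main(void)
{
	/* a 6000-byte write at position 5000 touches the pages at 4096 and 8192 */
	show_chunks(5000, 6000);
	return 0;
}
#endif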
 239 
 240 static inline void add_to_page_cache(struct page * page,
 241         struct inode * inode, unsigned long offset)
 242 {
 243         page->count++;
 244         page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 245         page->offset = offset;
 246         add_page_to_inode_queue(inode, page);
 247         add_page_to_hash_queue(inode, page);
 248 }
 249 
 250 /*
 251  * Try to read ahead in the file. "page_cache" is a potentially free page
  252  * that we could use for the cache (if it is 0 we can try to create one;
  253  * this is all overlapped with the IO on the previous page finishing anyway)
 254  */
 255 static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
 256 {
 257         struct page * page;
 258 
 259         offset &= PAGE_MASK;
 260         if (!page_cache) {
 261                 page_cache = __get_free_page(GFP_KERNEL);
 262                 if (!page_cache)
 263                         return 0;
 264         }
 265         if (offset >= inode->i_size)
 266                 return page_cache;
 267 #if 1
 268         page = find_page(inode, offset);
 269         if (page) {
 270                 page->count--;
 271                 return page_cache;
 272         }
 273         /*
 274          * Ok, add the new page to the hash-queues...
 275          */
 276         page = mem_map + MAP_NR(page_cache);
 277         add_to_page_cache(page, inode, offset);
 278         inode->i_op->readpage(inode, page);
 279         free_page(page_cache);
 280         return 0;
 281 #else
 282         return page_cache;
 283 #endif
 284 }
 285 
 286 /* 
 287  * Wait for IO to complete on a locked page.
 288  */
 289 void __wait_on_page(struct page *page)
 290 {
 291         struct wait_queue wait = { current, NULL };
 292 
 293         page->count++;
 294         add_wait_queue(&page->wait, &wait);
 295 repeat:
 296         run_task_queue(&tq_disk);
 297         current->state = TASK_UNINTERRUPTIBLE;
 298         if (PageLocked(page)) {
 299                 schedule();
 300                 goto repeat;
 301         }
 302         remove_wait_queue(&page->wait, &wait);
 303         page->count--;
 304         current->state = TASK_RUNNING;
 305 }
 306 
 307 #if 0
 308 #define PROFILE_READAHEAD
 309 #define DEBUG_READAHEAD
 310 #endif
 311 
 312 /*
  313  * Read-ahead profiling information
  314  * --------------------------------
  315  * Every PROFILE_MAXREADCOUNT reads, the following information is written
  316  * to the syslog:
  317  *   Percentage of asynchronous read-ahead.
  318  *   Average value of the read-ahead context fields.
 319  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 320  * to the syslog.
 321  */
 322 
 323 #ifdef PROFILE_READAHEAD
 324 
 325 #define PROFILE_MAXREADCOUNT 1000
 326 
 327 static unsigned long total_reada;
 328 static unsigned long total_async;
 329 static unsigned long total_ramax;
 330 static unsigned long total_ralen;
 331 static unsigned long total_rawin;
 332 
 333 static void profile_readahead(int async, struct file *filp)
 334 {
 335         unsigned long flags;
 336 
 337         ++total_reada;
 338         if (async)
 339                 ++total_async;
 340 
 341         total_ramax     += filp->f_ramax;
 342         total_ralen     += filp->f_ralen;
 343         total_rawin     += filp->f_rawin;
 344 
 345         if (total_reada > PROFILE_MAXREADCOUNT) {
 346                 save_flags(flags);
 347                 cli();
 348                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 349                         restore_flags(flags);
 350                         return;
 351                 }
 352 
 353                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 354                         total_ramax/total_reada,
 355                         total_ralen/total_reada,
 356                         total_rawin/total_reada,
 357                         (total_async*100)/total_reada);
 358 #ifdef DEBUG_READAHEAD
 359                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, rapos=%ld\n",
 360                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_rapos);
 361 #endif
 362 
 363                 total_reada     = 0;
 364                 total_async     = 0;
 365                 total_ramax     = 0;
 366                 total_ralen     = 0;
 367                 total_rawin     = 0;
 368 
 369                 restore_flags(flags);
 370         }
 371 }
 372 #endif  /* defined PROFILE_READAHEAD */
 373 
 374 /*
 375  * Read-ahead context:
 376  * -------------------
 377  * The read ahead context fields of the "struct file" are the following:
 378  * - f_rapos : position of the first byte after the last page we tried to
 379  *             read ahead.
 380  * - f_ramax : current read-ahead maximum size.
 381  * - f_ralen : length of the current IO read block we tried to read-ahead.
 382  * - f_rawin : length of the current read-ahead window.
 383  *             if last read-ahead was synchronous then
 384  *                  f_rawin = f_ralen
 385  *             otherwise (was asynchronous)
 386  *                  f_rawin = previous value of f_ralen + f_ralen
 387  *
 388  * Read-ahead limits:
 389  * ------------------
  390  * MIN_READAHEAD   : minimum read-ahead size when doing read-ahead.
  391  * MAX_READAHEAD   : maximum read-ahead size when doing read-ahead.
  392  * MAX_READWINDOW  : maximum read-ahead window length.
 393  *
 394  * Synchronous read-ahead benefits:
 395  * --------------------------------
  396  * Using a reasonable IO transfer length for peripheral devices increases
  397  * system performance.
 398  * Reasonable means, in this context, not too large but not too small.
  399  * The actual maximum transfer size is MAX_READAHEAD + PAGE_SIZE.
 400  *
 401  * Asynchronous read-ahead benefits:
 402  * ---------------------------------
  403  * Overlapping the next read request with user process execution increases
  404  * system performance.
 405  *
 406  * Read-ahead risks:
 407  * -----------------
  408  * We have to guess which data the user process will need next.
  409  * If that data is often not actually needed, system performance
  410  * suffers.
  411  * However, files are often accessed sequentially by application
  412  * programs, so it seems possible to have a reasonably good guessing
  413  * strategy.
  414  * We only try to read ahead in files that seem to be read sequentially.
 415  *
 416  * Asynchronous read-ahead risks:
 417  * ------------------------------
  418  * In order to maximize overlapping, we must start the asynchronous read
  419  * request from the device as soon as possible.
 420  * We must be very careful about:
 421  * - The number of effective pending IO read requests.
 422  *   ONE seems to be the only reasonable value.
 423  * - The total memory pool usage for the file access stream.
  424  *   We try to limit this to MAX_READWINDOW.
 425  */
 426 
 427 #define MAX_READWINDOW (PAGE_SIZE*32)
 428 #define MAX_READAHEAD (PAGE_SIZE*16)
 429 #define MIN_READAHEAD (PAGE_SIZE)
 430 
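
/*
 * Illustrative sketch (compiled out): a condensed restatement of the window
 * bookkeeping done at the end of generic_file_readahead() below, with the
 * read-ahead fields passed explicitly instead of living in "struct file".
 * Assuming a 4096-byte page, the limits above work out to
 * MIN_READAHEAD = 4k, MAX_READAHEAD = 64k and MAX_READWINDOW = 128k.
 */
#if 0
struct ra_sketch {
	unsigned long rapos;	/* first byte after the last page we tried */
	unsigned long ramax;	/* current read-ahead maximum size */
	unsigned long ralen;	/* length of the current read-ahead IO block */
	unsigned long rawin;	/* length of the current read-ahead window */
};

static void ra_window_update(struct ra_sketch *ra, unsigned long rapos,
	unsigned long ahead)
{
	ra->ralen += ahead;		/* bytes just requested */
	ra->rawin += ra->ralen;		/* grow the window */
	ra->rapos = rapos + ahead + PAGE_SIZE;

	/* grow the next read-ahead while the window stays small,
	   shrink it once the window has become too large */
	if (ra->rawin < MAX_READWINDOW)
		ra->ramax += PAGE_SIZE;
	else if (ra->rawin > MAX_READWINDOW && ra->ramax > PAGE_SIZE)
		ra->ramax -= PAGE_SIZE;

	if (ra->ramax > MAX_READAHEAD)
		ra->ramax = MAX_READAHEAD;
}
#endif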
 431 static inline unsigned long generic_file_readahead(struct file * filp, struct inode * inode,
 432         int try_async, unsigned long pos, struct page * page,
 433         unsigned long page_cache)
 434 {
 435         unsigned long max_ahead, ahead;
 436         unsigned long rapos, ppos;
 437 
 438         ppos = pos & PAGE_MASK;
 439         rapos = filp->f_rapos & PAGE_MASK;
 440         max_ahead = 0;
 441 
 442 /*
 443  * If the current page is locked, and if the current position is outside the
 444  * previous read IO request, try some synchronous read-ahead in order
 445  * to avoid too small IO requests.
 446  */
 447         if (PageLocked(page)) {
 448                 if (!rapos || ppos >= rapos || ppos + filp->f_ralen < rapos) {
 449                         rapos = ppos;
 450                         if (rapos < inode->i_size)
 451                                 max_ahead = filp->f_ramax;
 452                         filp->f_rawin = 0;
 453                         filp->f_ralen = PAGE_SIZE;
 454                 }
 455         }
 456 /*
  457  * The current page is not locked.
  458  * If the current position is inside the last read-ahead IO request,
  459  * it is time to try asynchronous read-ahead.
  460  * try_async = 2 means that we have to force an unplug of the device in
  461  * order to start the read IO asynchronously.
 462  */
 463         else if (try_async == 1 && rapos >= PAGE_SIZE &&
 464                  ppos <= rapos && ppos + filp->f_ralen >= rapos) {
 465 /*
  466  * Add ONE page to max_ahead in order to try to have about the same maximum
  467  * IO size as synchronous read-ahead (MAX_READAHEAD + PAGE_SIZE).
  468  * Compute the position of the last page we tried to read.
 469  */
 470                 rapos -= PAGE_SIZE;
 471                 if (rapos < inode->i_size)
 472                         max_ahead = filp->f_ramax + PAGE_SIZE;
 473 
 474                 if (max_ahead) {
 475                         filp->f_rawin = filp->f_ralen;
 476                         filp->f_ralen = 0;
 477                         try_async = 2;
 478                 }
 479         }
 480 /*
 481  * Try to read pages.
  482  * We hope that ll_rw_blk() plugging/unplugging, coalescing and sorting will
  483  * work well enough to avoid overly bad actual IO requests.
 484  */
 485         ahead = 0;
 486         while (ahead < max_ahead) {
 487                 ahead += PAGE_SIZE;
 488                 page_cache = try_to_read_ahead(inode, rapos + ahead, page_cache);
 489         }
 490 /*
  491  * If we tried to read some pages,
  492  * update the read-ahead context:
  493  * Store the length of the current read-ahead window.
  494  * Add PAGE_SIZE to the max read-ahead size each time we do read-ahead.
  495  *   That recipe avoids doing large IO for files that are not really
  496  *   accessed sequentially.
  497  * Do that only if the read-ahead window is smaller than MAX_READWINDOW,
  498  * in order to limit the number of pages used for this file access context.
  499  * If asynchronous,
  500  *    try to force an unplug of the device in order to start the
  501  *    asynchronous read IO request.
 502  */
 503         if (ahead) {
 504                 filp->f_ralen += ahead;
 505                 filp->f_rawin += filp->f_ralen;
 506                 filp->f_rapos = rapos + ahead + PAGE_SIZE;
 507 
 508                 if (filp->f_rawin < MAX_READWINDOW)
 509                         filp->f_ramax += PAGE_SIZE;
 510                 else if (filp->f_rawin > MAX_READWINDOW && filp->f_ramax > PAGE_SIZE)
 511                         filp->f_ramax -= PAGE_SIZE;
 512 
 513                 if (filp->f_ramax > MAX_READAHEAD)
 514                         filp->f_ramax = MAX_READAHEAD;
 515 #ifdef PROFILE_READAHEAD
 516                 profile_readahead((try_async == 2), filp);
 517 #endif
 518                 if (try_async == 2) {
 519                         run_task_queue(&tq_disk);
 520                 }
 521         }
 522 /*
 523  * Wait on the page if necessary
 524  */
 525         if (PageLocked(page)) {
 526                 __wait_on_page(page);
 527         }
 528         return page_cache;
 529 }
 530 
 531 
 532 /*
 533  * This is a generic file read routine, and uses the
 534  * inode->i_op->readpage() function for the actual low-level
 535  * stuff.
 536  *
 537  * This is really ugly. But the goto's actually try to clarify some
 538  * of the logic when it comes to error handling etc.
 539  */
 540 
 541 int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
 542 {
 543         int error, read;
 544         unsigned long pos, ppos, page_cache;
 545         int try_async;
 546 
 547         if (count <= 0)
 548                 return 0;
 549 
 550         error = 0;
 551         read = 0;
 552         page_cache = 0;
 553 
 554         pos = filp->f_pos;
 555         ppos = pos & PAGE_MASK;
 556 /*
 557  * Check if the current position is inside the previous read-ahead window.
  558  * If so, we assume that the file accesses are sequential enough to
  559  * continue asynchronous read-ahead.
  560  * Do minimum read-ahead at the beginning of the file since some tools
  561  * only read the beginning of files.
  562  * Break read-ahead if the file position is outside the previous read-ahead
  563  * window or if the read-ahead position is 0.
  564  */
  565 /*
  566  * Will not try asynchronous read-ahead.
  567  * Reset the read-ahead context to zero.
 568  */
 569         if (pos+count < MIN_READAHEAD || !filp->f_rapos ||
 570             ppos > filp->f_rapos || ppos + filp->f_rawin < filp->f_rapos) {
 571                 try_async = 0;
 572                 filp->f_rapos = 0;
 573                 filp->f_ralen = 0;
 574                 filp->f_ramax = 0;
 575                 filp->f_rawin = 0;
 576 /*
 577  * Will try asynchronous read-ahead.
 578  */
 579         } else {
 580                 try_async = 1;
 581         }
 582 /*
  583  * Adjust the current value of the read-ahead max.
  584  * If the read operation stays within the first half page, force no read-ahead.
  585  * Otherwise, first try some value near count,
  586  *      doing at least MIN_READAHEAD and at most MAX_READAHEAD.
 587  */
 588         if (pos + count <= (PAGE_SIZE >> 1)) {
 589                 try_async = 0;
 590                 filp->f_ramax = 0;
 591         } else {
 592                 if (filp->f_ramax < count)
 593                         filp->f_ramax = count & PAGE_MASK;
 594 
 595                 if (filp->f_ramax < MIN_READAHEAD)
 596                         filp->f_ramax = MIN_READAHEAD;
 597                 else if (filp->f_ramax > MAX_READAHEAD)
 598                         filp->f_ramax = MAX_READAHEAD;
 599         }
 600 
 601         for (;;) {
 602                 struct page *page;
 603                 unsigned long offset, addr, nr;
 604 
 605                 if (pos >= inode->i_size)
 606                         break;
 607                 offset = pos & ~PAGE_MASK;
 608                 nr = PAGE_SIZE - offset;
 609                 /*
 610                  * Try to find the data in the page cache..
 611                  */
 612                 page = find_page(inode, pos & PAGE_MASK);
 613                 if (page)
 614                         goto found_page;
 615 
 616                 /*
 617                  * Ok, it wasn't cached, so we need to create a new
 618                  * page..
 619                  */
 620                 if (page_cache)
 621                         goto new_page;
 622 
 623                 error = -ENOMEM;
 624                 page_cache = __get_free_page(GFP_KERNEL);
 625                 if (!page_cache)
 626                         break;
 627                 error = 0;
 628 
 629                 /*
 630                  * That could have slept, so we need to check again..
 631                  */
 632                 if (pos >= inode->i_size)
 633                         break;
 634                 page = find_page(inode, pos & PAGE_MASK);
 635                 if (!page)
 636                         goto new_page;
 637 
 638 found_page:
 639                 addr = page_address(page);
 640                 if (nr > count)
 641                         nr = count;
 642 /*
  643  * Do not try read-ahead unless the current page is filled or being filled.
  644  * If our goal was to try asynchronous read-ahead, we were quite wrong.
  645  * Set the max read-ahead to a smaller value in order to partly correct
  646  * this mistake.
 647  */
 648                 if (PageUptodate(page) || PageLocked(page))
 649                         page_cache = generic_file_readahead(filp, inode, try_async, pos, page, page_cache);
 650                 else if (try_async) {
 651                         if (filp->f_ramax > MIN_READAHEAD)
 652                                 filp->f_ramax -= PAGE_SIZE;
 653                 }
 654 
 655                 if (!PageUptodate(page))
 656                         goto read_page;
 657                 if (nr > inode->i_size - pos)
 658                         nr = inode->i_size - pos;
 659                 memcpy_tofs(buf, (void *) (addr + offset), nr);
 660                 free_page(addr);
 661                 buf += nr;
 662                 pos += nr;
 663                 read += nr;
 664                 count -= nr;
 665                 if (count)
 666                         continue;
 667                 break;
 668         
 669 
 670 new_page:
 671                 /*
 672                  * Ok, add the new page to the hash-queues...
 673                  */
 674                 addr = page_cache;
 675                 page = mem_map + MAP_NR(page_cache);
 676                 page_cache = 0;
 677                 add_to_page_cache(page, inode, pos & PAGE_MASK);
 678 
 679                 /*
 680                  * Error handling is tricky. If we get a read error,
 681                  * the cached page stays in the cache (but uptodate=0),
 682                  * and the next process that accesses it will try to
 683                  * re-read it. This is needed for NFS etc, where the
 684                  * identity of the reader can decide if we can read the
 685                  * page or not..
 686                  */
 687 read_page:
 688                 error = inode->i_op->readpage(inode, page);
 689                 if (!error) {
 690                         if (!PageError(page))
 691                                 goto found_page;
 692                         error = -EIO;
 693                 }
 694                 free_page(addr);
 695                 break;
 696         }
 697 
 698         filp->f_pos = pos;
 699         filp->f_reada = 1;
 700         if (page_cache)
 701                 free_page(page_cache);
 702         if (!IS_RDONLY(inode)) {
 703                 inode->i_atime = CURRENT_TIME;
 704                 inode->i_dirt = 1;
 705         }
 706         if (!read)
 707                 read = error;
 708         return read;
 709 }
 710 
 711 /*
  712  * Find a cached page and wait for it to become up-to-date, then return
  713  * the page address.  Increments the page count.
 714  */
 715 static inline unsigned long fill_page(struct inode * inode, unsigned long offset)
 716 {
 717         struct page * page;
 718         unsigned long new_page;
 719 
 720         page = find_page(inode, offset);
 721         if (page)
 722                 goto found_page_dont_free;
 723         new_page = __get_free_page(GFP_KERNEL);
 724         page = find_page(inode, offset);
 725         if (page)
 726                 goto found_page;
 727         if (!new_page)
 728                 return 0;
 729         page = mem_map + MAP_NR(new_page);
 730         new_page = 0;
 731         add_to_page_cache(page, inode, offset);
 732         inode->i_op->readpage(inode, page);
 733         if (PageLocked(page))
 734                 new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
 735 found_page:
 736         if (new_page)
 737                 free_page(new_page);
 738 found_page_dont_free:
 739         wait_on_page(page);
 740         return page_address(page);
 741 }
 742 
 743 /*
 744  * Semantics for shared and private memory areas are different past the end
 745  * of the file. A shared mapping past the last page of the file is an error
 746  * and results in a SIGBUS, while a private mapping just maps in a zero page.
 747  */
 748 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 749 {
 750         unsigned long offset;
 751         struct inode * inode = area->vm_inode;
 752         unsigned long page;
 753 
 754         offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 755         if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 756                 return 0;
 757 
 758         page = fill_page(inode, offset);
 759         if (page && no_share) {
 760                 unsigned long new_page = __get_free_page(GFP_KERNEL);
 761                 if (new_page) {
 762                         memcpy((void *) new_page, (void *) page, PAGE_SIZE);
 763                         flush_page_to_ram(new_page);
 764                 }
 765                 free_page(page);
 766                 return new_page;
 767         }
 768         flush_page_to_ram(page);
 769         return page;
 770 }
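
/*
 * Illustrative sketch (compiled out): a user-space view of the
 * past-end-of-file semantics described in the comment above
 * filemap_nopage().  The file name is made up and error handling is
 * omitted; "data" is assumed to be a regular file much smaller than
 * two pages.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("data", O_RDONLY);

	/* map two pages, so the second page lies wholly past EOF */
	char *shr = mmap(NULL, 2 * pagesize, PROT_READ, MAP_SHARED, fd, 0);
	char *prv = mmap(NULL, 2 * pagesize, PROT_READ, MAP_PRIVATE, fd, 0);
	char c;

	c = prv[pagesize];	/* private: the page past EOF reads as zeroes */
	c = shr[pagesize];	/* shared: faulting past EOF raises SIGBUS */
	(void) c;

	printf("not reached once SIGBUS is delivered\n");
	munmap(shr, 2 * pagesize);
	munmap(prv, 2 * pagesize);
	close(fd);
	return 0;
}
#endif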
 771 
 772 /*
 773  * Tries to write a shared mapped page to its backing store. May return -EIO
 774  * if the disk is full.
 775  */
 776 static inline int do_write_page(struct inode * inode, struct file * file,
 777         const char * page, unsigned long offset)
 778 {
 779         int old_fs, retval;
 780         unsigned long size;
 781 
 782         size = offset + PAGE_SIZE;
 783         /* refuse to extend file size.. */
 784         if (S_ISREG(inode->i_mode)) {
 785                 if (size > inode->i_size)
 786                         size = inode->i_size;
 787                 /* Ho humm.. We should have tested for this earlier */
 788                 if (size < offset)
 789                         return -EIO;
 790         }
 791         size -= offset;
 792         old_fs = get_fs();
 793         set_fs(KERNEL_DS);
 794         retval = -EIO;
 795         if (size == file->f_op->write(inode, file, (const char *) page, size))
 796                 retval = 0;
 797         set_fs(old_fs);
 798         return retval;
 799 }
 800 
 801 static int filemap_write_page(struct vm_area_struct * vma,
 802         unsigned long offset,
 803         unsigned long page)
 804 {
 805         int result;
 806         struct file file;
 807         struct inode * inode;
 808         struct buffer_head * bh;
 809 
 810         bh = mem_map[MAP_NR(page)].buffers;
 811         if (bh) {
 812                 /* whee.. just mark the buffer heads dirty */
 813                 struct buffer_head * tmp = bh;
 814                 do {
 815                         mark_buffer_dirty(tmp, 0);
 816                         tmp = tmp->b_this_page;
 817                 } while (tmp != bh);
 818                 return 0;
 819         }
 820 
 821         inode = vma->vm_inode;
 822         file.f_op = inode->i_op->default_file_ops;
 823         if (!file.f_op->write)
 824                 return -EIO;
 825         file.f_mode = 3;
 826         file.f_flags = 0;
 827         file.f_count = 1;
 828         file.f_inode = inode;
 829         file.f_pos = offset;
 830         file.f_reada = 0;
 831 
 832         down(&inode->i_sem);
 833         result = do_write_page(inode, &file, (const char *) page, offset);
 834         up(&inode->i_sem);
 835         return result;
 836 }
 837 
 838 
 839 /*
 840  * Swapping to a shared file: while we're busy writing out the page
 841  * (and the page still exists in memory), we save the page information
 842  * in the page table, so that "filemap_swapin()" can re-use the page
 843  * immediately if it is called while we're busy swapping it out..
 844  *
 845  * Once we've written it all out, we mark the page entry "empty", which
 846  * will result in a normal page-in (instead of a swap-in) from the now
 847  * up-to-date disk file.
 848  */
 849 int filemap_swapout(struct vm_area_struct * vma,
 850         unsigned long offset,
 851         pte_t *page_table)
 852 {
 853         int error;
 854         unsigned long page = pte_page(*page_table);
 855         unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
 856 
 857         flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 858         set_pte(page_table, __pte(entry));
 859         flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 860         error = filemap_write_page(vma, offset, page);
 861         if (pte_val(*page_table) == entry)
 862                 pte_clear(page_table);
 863         return error;
 864 }
 865 
 866 /*
 867  * filemap_swapin() is called only if we have something in the page
 868  * tables that is non-zero (but not present), which we know to be the
 869  * page index of a page that is busy being swapped out (see above).
 870  * So we just use it directly..
 871  */
 872 static pte_t filemap_swapin(struct vm_area_struct * vma,
 873         unsigned long offset,
 874         unsigned long entry)
 875 {
 876         unsigned long page = SWP_OFFSET(entry);
 877 
 878         mem_map[page].count++;
 879         page = (page << PAGE_SHIFT) + PAGE_OFFSET;
 880         return mk_pte(page,vma->vm_page_prot);
 881 }
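
/*
 * Illustrative sketch (compiled out): the round trip between the two
 * routines above.  filemap_swapout() hides the mem_map index of the
 * in-flight page inside a swap entry, and filemap_swapin() recovers it,
 * relying on SWP_OFFSET(SWP_ENTRY(type, offset)) returning offset.
 * The helper name is made up.
 */
#if 0
static pte_t swap_entry_round_trip(struct vm_area_struct *vma, unsigned long page)
{
	/* what filemap_swapout() stores in the page table entry ... */
	unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));

	/* ... and what filemap_swapin() later reconstructs from it */
	unsigned long nr = SWP_OFFSET(entry);			/* MAP_NR(page) again */
	unsigned long addr = (nr << PAGE_SHIFT) + PAGE_OFFSET;	/* the original page */

	return mk_pte(addr, vma->vm_page_prot);
}
#endif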
 882 
 883 
 884 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 885         unsigned long address, unsigned int flags)
 886 {
 887         pte_t pte = *ptep;
 888         unsigned long page;
 889         int error;
 890 
 891         if (!(flags & MS_INVALIDATE)) {
 892                 if (!pte_present(pte))
 893                         return 0;
 894                 if (!pte_dirty(pte))
 895                         return 0;
 896                 flush_page_to_ram(pte_page(pte));
 897                 flush_cache_page(vma, address);
 898                 set_pte(ptep, pte_mkclean(pte));
 899                 flush_tlb_page(vma, address);
 900                 page = pte_page(pte);
 901                 mem_map[MAP_NR(page)].count++;
 902         } else {
 903                 if (pte_none(pte))
 904                         return 0;
 905                 flush_cache_page(vma, address);
 906                 pte_clear(ptep);
 907                 flush_tlb_page(vma, address);
 908                 if (!pte_present(pte)) {
 909                         swap_free(pte_val(pte));
 910                         return 0;
 911                 }
 912                 page = pte_page(pte);
 913                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 914                         free_page(page);
 915                         return 0;
 916                 }
 917         }
 918         error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 919         free_page(page);
 920         return error;
 921 }
 922 
 923 static inline int filemap_sync_pte_range(pmd_t * pmd,
 924         unsigned long address, unsigned long size, 
 925         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
 926 {
 927         pte_t * pte;
 928         unsigned long end;
 929         int error;
 930 
 931         if (pmd_none(*pmd))
 932                 return 0;
 933         if (pmd_bad(*pmd)) {
 934                 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 935                 pmd_clear(pmd);
 936                 return 0;
 937         }
 938         pte = pte_offset(pmd, address);
 939         offset += address & PMD_MASK;
 940         address &= ~PMD_MASK;
 941         end = address + size;
 942         if (end > PMD_SIZE)
 943                 end = PMD_SIZE;
 944         error = 0;
 945         do {
 946                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
 947                 address += PAGE_SIZE;
 948                 pte++;
 949         } while (address < end);
 950         return error;
 951 }
 952 
 953 static inline int filemap_sync_pmd_range(pgd_t * pgd,
 954         unsigned long address, unsigned long size, 
 955         struct vm_area_struct *vma, unsigned int flags)
 956 {
 957         pmd_t * pmd;
 958         unsigned long offset, end;
 959         int error;
 960 
 961         if (pgd_none(*pgd))
 962                 return 0;
 963         if (pgd_bad(*pgd)) {
 964                 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
 965                 pgd_clear(pgd);
 966                 return 0;
 967         }
 968         pmd = pmd_offset(pgd, address);
 969         offset = address & PGDIR_MASK;
 970         address &= ~PGDIR_MASK;
 971         end = address + size;
 972         if (end > PGDIR_SIZE)
 973                 end = PGDIR_SIZE;
 974         error = 0;
 975         do {
 976                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
 977                 address = (address + PMD_SIZE) & PMD_MASK;
 978                 pmd++;
 979         } while (address < end);
 980         return error;
 981 }
 982 
 983 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
 984         size_t size, unsigned int flags)
 985 {
 986         pgd_t * dir;
 987         unsigned long end = address + size;
 988         int error = 0;
 989 
 990         dir = pgd_offset(current->mm, address);
 991         flush_cache_range(vma->vm_mm, end - size, end);
 992         while (address < end) {
 993                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
 994                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 995                 dir++;
 996         }
 997         flush_tlb_range(vma->vm_mm, end - size, end);
 998         return error;
 999 }
1000 
1001 /*
1002  * This handles (potentially partial) area unmaps..
1003  */
1004 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1005 {
1006         filemap_sync(vma, start, len, MS_ASYNC);
1007 }
1008 
1009 /*
1010  * Shared mappings need to be able to do the right thing at
1011  * close/unmap/sync. They will also use the private file as
1012  * backing-store for swapping..
1013  */
1014 static struct vm_operations_struct file_shared_mmap = {
1015         NULL,                   /* no special open */
1016         NULL,                   /* no special close */
1017         filemap_unmap,          /* unmap - we need to sync the pages */
1018         NULL,                   /* no special protect */
1019         filemap_sync,           /* sync */
1020         NULL,                   /* advise */
1021         filemap_nopage,         /* nopage */
1022         NULL,                   /* wppage */
1023         filemap_swapout,        /* swapout */
1024         filemap_swapin,         /* swapin */
1025 };
1026 
1027 /*
1028  * Private mappings just need to be able to load in the map.
1029  *
1030  * (This is actually used for shared mappings as well, if we
1031  * know they can't ever get write permissions..)
1032  */
1033 static struct vm_operations_struct file_private_mmap = {
1034         NULL,                   /* open */
1035         NULL,                   /* close */
1036         NULL,                   /* unmap */
1037         NULL,                   /* protect */
1038         NULL,                   /* sync */
1039         NULL,                   /* advise */
1040         filemap_nopage,         /* nopage */
1041         NULL,                   /* wppage */
1042         NULL,                   /* swapout */
1043         NULL,                   /* swapin */
1044 };
1045 
1046 /* This is used for a general mmap of a disk file */
1047 int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
1048 {
1049         struct vm_operations_struct * ops;
1050 
1051         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1052                 ops = &file_shared_mmap;
1053                 /* share_page() can only guarantee proper page sharing if
1054                  * the offsets are all page aligned. */
1055                 if (vma->vm_offset & (PAGE_SIZE - 1))
1056                         return -EINVAL;
1057         } else {
1058                 ops = &file_private_mmap;
1059                 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1060                         return -EINVAL;
1061         }
1062         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1063                 return -EACCES;
1064         if (!inode->i_op || !inode->i_op->readpage)
1065                 return -ENOEXEC;
1066         if (!IS_RDONLY(inode)) {
1067                 inode->i_atime = CURRENT_TIME;
1068                 inode->i_dirt = 1;
1069         }
1070         vma->vm_inode = inode;
1071         inode->i_count++;
1072         vma->vm_ops = ops;
1073         return 0;
1074 }
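
/*
 * Illustrative sketch (compiled out): how a filesystem would typically
 * use the routine above.  "some_fs_file_ops" and some_fs_wire_up() are
 * hypothetical names; the point is that a filesystem providing a
 * readpage() inode operation can reuse generic_file_read() and
 * generic_file_mmap() directly.
 */
#if 0
static struct file_operations some_fs_file_ops;

static void some_fs_wire_up(void)
{
	some_fs_file_ops.read = generic_file_read;
	some_fs_file_ops.mmap = generic_file_mmap;
}
#endif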
1075 
1076 
1077 /*
1078  * The msync() system call.
1079  */
1080 
1081 static int msync_interval(struct vm_area_struct * vma,
1082         unsigned long start, unsigned long end, int flags)
1083 {
1084         if (!vma->vm_inode)
1085                 return 0;
1086         if (vma->vm_ops->sync) {
1087                 int error;
1088                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1089                 if (error)
1090                         return error;
1091                 if (flags & MS_SYNC)
1092                         return file_fsync(vma->vm_inode, NULL);
1093                 return 0;
1094         }
1095         return 0;
1096 }
1097 
1098 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1099 {
1100         unsigned long end;
1101         struct vm_area_struct * vma;
1102         int unmapped_error, error;
1103 
1104         if (start & ~PAGE_MASK)
1105                 return -EINVAL;
1106         len = (len + ~PAGE_MASK) & PAGE_MASK;
1107         end = start + len;
1108         if (end < start)
1109                 return -EINVAL;
1110         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1111                 return -EINVAL;
1112         if (end == start)
1113                 return 0;
1114         /*
1115          * If the interval [start,end) covers some unmapped address ranges,
1116          * just ignore them, but return -EFAULT at the end.
1117          */
1118         vma = find_vma(current, start);
1119         unmapped_error = 0;
1120         for (;;) {
1121                 /* Still start < end. */
1122                 if (!vma)
1123                         return -EFAULT;
1124                 /* Here start < vma->vm_end. */
1125                 if (start < vma->vm_start) {
1126                         unmapped_error = -EFAULT;
1127                         start = vma->vm_start;
1128                 }
1129                 /* Here vma->vm_start <= start < vma->vm_end. */
1130                 if (end <= vma->vm_end) {
1131                         if (start < end) {
1132                                 error = msync_interval(vma, start, end, flags);
1133                                 if (error)
1134                                         return error;
1135                         }
1136                         return unmapped_error;
1137                 }
1138                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1139                 error = msync_interval(vma, start, vma->vm_end, flags);
1140                 if (error)
1141                         return error;
1142                 start = vma->vm_end;
1143                 vma = vma->vm_next;
1144         }
1145 }
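
/*
 * Illustrative sketch (compiled out): how the system call above is
 * typically reached from user space.  The file name is made up and
 * error handling is omitted.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("data", O_RDWR);
	char *map = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);

	memcpy(map, "hello", 5);

	/* "start" must be page aligned and "len" is rounded up to a whole
	   page.  MS_ASYNC writes the dirty pages back to the file,
	   MS_SYNC additionally fsyncs the inode (see msync_interval()),
	   and MS_INVALIDATE drops the page table entries so the pages
	   are faulted in again on the next access. */
	msync(map, pagesize, MS_SYNC);

	munmap(map, pagesize);
	close(fd);
	return 0;
}
#endif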
