root/mm/filemap.c

DEFINITIONS

This source file includes the following definitions:
  1. invalidate_inode_pages
  2. truncate_inode_pages
  3. shrink_mmap
  4. page_unuse
  5. update_vm_cache
  6. add_to_page_cache
  7. try_to_read_ahead
  8. __wait_on_page
  9. profile_readahead
  10. generic_file_readahead
  11. generic_file_read
  12. fill_page
  13. filemap_nopage
  14. do_write_page
  15. filemap_write_page
  16. filemap_swapout
  17. filemap_swapin
  18. filemap_sync_pte
  19. filemap_sync_pte_range
  20. filemap_sync_pmd_range
  21. filemap_sync
  22. filemap_unmap
  23. generic_file_mmap
  24. msync_interval
  25. sys_msync

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994, 1995  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem does this differently, for example)
  11  */
  12 #include <linux/stat.h>
  13 #include <linux/sched.h>
  14 #include <linux/kernel.h>
  15 #include <linux/mm.h>
  16 #include <linux/shm.h>
  17 #include <linux/errno.h>
  18 #include <linux/mman.h>
  19 #include <linux/string.h>
  20 #include <linux/malloc.h>
  21 #include <linux/fs.h>
  22 #include <linux/locks.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/swap.h>
  25 
  26 #include <asm/segment.h>
  27 #include <asm/system.h>
  28 #include <asm/pgtable.h>
  29 
  30 /*
  31  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  32  * though.
  33  *
  34  * Shared mappings now work. 15.8.1995  Bruno.
  35  */
  36 
  37 unsigned long page_cache_size = 0;
  38 struct page * page_hash_table[PAGE_HASH_SIZE];
  39 
  40 /*
  41  * Simple routines for both non-shared and shared mappings.
  42  */
  43 
  44 /*
  45  * Invalidate the pages of an inode, removing all pages that aren't
  46  * locked down (those are sure to be up-to-date anyway, so we shouldn't
  47  * invalidate them).
  48  */
  49 void invalidate_inode_pages(struct inode * inode)
  50 {
  51         struct page ** p;
  52         struct page * page;
  53 
  54         p = &inode->i_pages;
  55         while ((page = *p) != NULL) {
  56                 if (PageLocked(page)) {
  57                         p = &page->next;
  58                         continue;
  59                 }
  60                 inode->i_nrpages--;
  61                 if ((*p = page->next) != NULL)
  62                         (*p)->prev = page->prev;
  63                 page->dirty = 0;
  64                 page->next = NULL;
  65                 page->prev = NULL;
  66                 remove_page_from_hash_queue(page);
  67                 page->inode = NULL;
  68                 free_page(page_address(page));
  69                 continue;
  70         }
  71 }
  72 
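/*
 * Illustrative sketch, not part of the original source: the loop above walks
 * the inode's page list through a pointer-to-pointer ("p"), so removing an
 * entry only needs *p to be rewritten.  A minimal userspace version of the
 * same idiom, using a hypothetical singly linked node type:
 */
#if 0
#include <stdlib.h>

struct node {                           /* hypothetical stand-in for struct page */
        struct node *next;
        int keep;                       /* stand-in for PageLocked() */
};

/* Remove and free every node whose 'keep' flag is clear, in one pass. */
static void prune(struct node **head)
{
        struct node **p = head;
        struct node *n;

        while ((n = *p) != NULL) {
                if (n->keep) {
                        p = &n->next;   /* keep it: advance the link we look through */
                        continue;
                }
                *p = n->next;           /* unlink without needing a 'prev' pointer */
                free(n);
        }
}
#endif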
  73 /*
  74  * Truncate the page cache at a set offset, removing the pages
  75  * that are beyond that offset (and zeroing out partial pages).
  76  */
  77 void truncate_inode_pages(struct inode * inode, unsigned long start)
  78 {
  79         struct page ** p;
  80         struct page * page;
  81 
  82 repeat:
  83         p = &inode->i_pages;
  84         while ((page = *p) != NULL) {
  85                 unsigned long offset = page->offset;
  86 
  87                 /* page wholly truncated - free it */
  88                 if (offset >= start) {
  89                         if (PageLocked(page)) {
  90                                 wait_on_page(page);
  91                                 goto repeat;
  92                         }
  93                         inode->i_nrpages--;
  94                         if ((*p = page->next) != NULL)
  95                                 (*p)->prev = page->prev;
  96                         page->dirty = 0;
  97                         page->next = NULL;
  98                         page->prev = NULL;
  99                         remove_page_from_hash_queue(page);
 100                         page->inode = NULL;
 101                         free_page(page_address(page));
 102                         continue;
 103                 }
 104                 p = &page->next;
 105                 offset = start - offset;
 106                 /* partial truncate, clear end of page */
 107                 if (offset < PAGE_SIZE)
 108                         memset((void *) (offset + page_address(page)), 0, PAGE_SIZE - offset);
 109         }
 110 }
 111 
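/*
 * Illustrative sketch, not part of the original source: the arithmetic above
 * classifies each cached page against the truncation point.  A page starting
 * at or beyond "start" is dropped; the page that straddles "start" has its
 * tail cleared.  The same classification in isolation, assuming a 4096-byte
 * page (PAGE_SIZE is architecture-dependent):
 */
#if 0
#include <string.h>

#define SKETCH_PAGE_SIZE 4096UL

/*
 * page_off  : file offset of the first byte of a cached page
 * start     : new file size (truncation point)
 * page_data : address of the page's data
 *
 * Returns 1 if the page should be dropped entirely, 0 if it is kept
 * (possibly with its tail zeroed out).
 */
static int truncate_one_page(unsigned long page_off, unsigned long start,
                             void *page_data)
{
        unsigned long partial;

        if (page_off >= start)
                return 1;                       /* wholly beyond the new size */
        partial = start - page_off;
        if (partial < SKETCH_PAGE_SIZE)         /* page straddles the new size */
                memset((char *) page_data + partial, 0,
                       SKETCH_PAGE_SIZE - partial);
        return 0;
}
#endif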
 112 int shrink_mmap(int priority, int dma)
 113 {
 114         static int clock = 0;
 115         struct page * page;
 116         unsigned long limit = MAP_NR(high_memory);
 117         struct buffer_head *tmp, *bh;
 118 
 119         priority = (limit<<2) >> priority;
 120         page = mem_map + clock;
 121         do {
 122                 priority--;
 123                 if (PageLocked(page))
 124                         goto next;
 125                 if (dma && !PageDMA(page))
 126                         goto next;
 127                 /* First of all, regenerate the page's referenced bit
 128                    from any buffers in the page */
 129                 bh = page->buffers;
 130                 if (bh) {
 131                         tmp = bh;
 132                         do {
 133                                 if (buffer_touched(tmp)) {
 134                                         clear_bit(BH_Touched, &tmp->b_state);
 135                                         set_bit(PG_referenced, &page->flags);
 136                                 }
 137                                 tmp = tmp->b_this_page;
 138                         } while (tmp != bh);
 139                 }
 140 
 141                 /* We can't throw away shared pages, but we do mark
 142                    them as referenced.  This relies on the fact that
 143                    no page is currently in both the page cache and the
 144                    buffer cache; we'd have to modify the following
 145                    test to allow for that case. */
 146 
 147                 switch (page->count) {
 148                         case 1:
 149                                 /* If it has been referenced recently, don't free it */
 150                                 if (clear_bit(PG_referenced, &page->flags))
 151                                         break;
 152 
 153                                 /* is it a page cache page? */
 154                                 if (page->inode) {
 155                                         remove_page_from_hash_queue(page);
 156                                         remove_page_from_inode_queue(page);
 157                                         free_page(page_address(page));
 158                                         return 1;
 159                                 }
 160 
 161                                 /* is it a buffer cache page? */
 162                                 if (bh && try_to_free_buffer(bh, &bh, 6))
 163                                         return 1;
 164                                 break;
 165 
 166                         default:
  167                                 /* more than one user: we can't throw it away */
 168                                 set_bit(PG_referenced, &page->flags);
 169                                 /* fall through */
 170                         case 0:
 171                                 /* nothing */
 172                 }
 173 next:
 174                 page++;
 175                 clock++;
 176                 if (clock >= limit) {
 177                         clock = 0;
 178                         page = mem_map;
 179                 }
 180         } while (priority > 0);
 181         return 0;
 182 }
 183 
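/*
 * Illustrative sketch, not part of the original source: shrink_mmap() is a
 * clock-style scan.  A static hand ("clock") wraps around mem_map, pages that
 * were recently referenced get a second chance, and the number of pages
 * examined per call shrinks as "priority" grows.  The control structure of
 * such a scan, with the per-page freeing policy stubbed out:
 */
#if 0
struct fake_page { int referenced; };           /* hypothetical stand-in for mem_map[] */
static struct fake_page fake_map[1024];
static unsigned long fake_limit = 1024;         /* stand-in for MAP_NR(high_memory) */

static int try_to_free(struct fake_page *p)     /* stub for the real freeing policy */
{
        (void) p;
        return 1;
}

static int clock_scan(int priority)
{
        static unsigned long hand = 0;          /* the clock hand persists across calls */
        unsigned long budget = (fake_limit << 2) >> priority;

        while (budget-- > 0) {
                struct fake_page *p = &fake_map[hand];

                if (++hand >= fake_limit)
                        hand = 0;               /* wrap around the map */
                if (p->referenced) {
                        p->referenced = 0;      /* second chance: clear and move on */
                        continue;
                }
                if (try_to_free(p))
                        return 1;               /* freed one page: stop here */
        }
        return 0;                               /* budget exhausted, nothing freed */
}
#endif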
 184 /*
 185  * This is called from try_to_swap_out() when we try to get rid of some
 186  * pages..  If we're unmapping the last occurrence of this page, we also
 187  * free it from the page hash-queues etc, as we don't want to keep it
 188  * in-core unnecessarily.
 189  */
 190 unsigned long page_unuse(unsigned long page)
 191 {
 192         struct page * p = mem_map + MAP_NR(page);
 193         int count = p->count;
 194 
 195         if (count != 2)
 196                 return count;
 197         if (!p->inode)
 198                 return count;
 199         remove_page_from_hash_queue(p);
 200         remove_page_from_inode_queue(p);
 201         free_page(page);
 202         return 1;
 203 }
 204 
 205 /*
  206  * Update a page cache copy when we're doing a "write()" system call
 207  * See also "update_vm_cache()".
 208  */
 209 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 210 {
 211         unsigned long offset, len;
 212 
 213         offset = (pos & ~PAGE_MASK);
 214         pos = pos & PAGE_MASK;
 215         len = PAGE_SIZE - offset;
 216         do {
 217                 struct page * page;
 218 
 219                 if (len > count)
 220                         len = count;
 221                 page = find_page(inode, pos);
 222                 if (page) {
 223                         unsigned long addr;
 224 
 225                         wait_on_page(page);
 226                         addr = page_address(page);
 227                         memcpy((void *) (offset + addr), buf, len);
 228                         free_page(addr);
 229                 }
 230                 count -= len;
 231                 buf += len;
 232                 len = PAGE_SIZE;
 233                 offset = 0;
 234                 pos += PAGE_SIZE;
 235         } while (count);
 236 }
 237 
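/*
 * Illustrative sketch, not part of the original source: the loop above splits
 * an arbitrary (pos, count) write into per-page chunks; only the first chunk
 * can start in the middle of a page, and only the last one can be short.
 * The same chunking in isolation, assuming a 4096-byte page:
 */
#if 0
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

/* "count" is assumed to be non-zero, as it is for the caller above. */
static void show_chunks(unsigned long pos, unsigned long count)
{
        unsigned long offset = pos & ~SKETCH_PAGE_MASK; /* offset inside the first page */
        unsigned long len = SKETCH_PAGE_SIZE - offset;

        pos &= SKETCH_PAGE_MASK;
        do {
                if (len > count)
                        len = count;                    /* final, possibly short, chunk */
                printf("page at %#lx: copy %lu bytes at offset %lu\n",
                       pos, len, offset);
                count -= len;
                len = SKETCH_PAGE_SIZE;                 /* later chunks are page-sized */
                offset = 0;                             /* ...and start at the page boundary */
                pos += SKETCH_PAGE_SIZE;
        } while (count);
}
#endif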
 238 static inline void add_to_page_cache(struct page * page,
 239         struct inode * inode, unsigned long offset)
 240 {
 241         page->count++;
 242         page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 243         page->offset = offset;
 244         add_page_to_inode_queue(inode, page);
 245         add_page_to_hash_queue(inode, page);
 246 }
 247 
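/*
 * Illustrative sketch, not part of the original source: find_page() and the
 * hash-queue helpers used above live in <linux/pagemap.h> and key the page
 * hash table on the (inode, offset) pair.  A minimal chained-hash lookup in
 * the same spirit; the table size and hash function here are hypothetical:
 */
#if 0
#define SKETCH_HASH_SIZE 1024UL

struct sketch_page {
        struct sketch_page *hash_next;
        void *inode;                            /* identity of the owning file */
        unsigned long offset;                   /* page-aligned file offset */
};

static struct sketch_page *sketch_hash[SKETCH_HASH_SIZE];

static unsigned long sketch_hashfn(void *inode, unsigned long offset)
{
        /* hypothetical hash: mix the inode pointer with the offset */
        return ((unsigned long) inode ^ offset) % SKETCH_HASH_SIZE;
}

static struct sketch_page *sketch_find(void *inode, unsigned long offset)
{
        struct sketch_page *p = sketch_hash[sketch_hashfn(inode, offset)];

        for (; p != NULL; p = p->hash_next)
                if (p->inode == inode && p->offset == offset)
                        return p;               /* cache hit */
        return NULL;                            /* cache miss */
}
#endif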
 248 /*
 249  * Try to read ahead in the file. "page_cache" is a potentially free page
 250  * that we could use for the cache (if it is 0 we can try to create one,
 251  * this is all overlapped with the IO on the previous page finishing anyway)
 252  */
 253 static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
 254 {
 255         struct page * page;
 256 
 257         offset &= PAGE_MASK;
 258         if (!page_cache) {
 259                 page_cache = __get_free_page(GFP_KERNEL);
 260                 if (!page_cache)
 261                         return 0;
 262         }
 263         if (offset >= inode->i_size)
 264                 return page_cache;
 265 #if 1
 266         page = find_page(inode, offset);
 267         if (page) {
 268                 page->count--;
 269                 return page_cache;
 270         }
 271         /*
 272          * Ok, add the new page to the hash-queues...
 273          */
 274         page = mem_map + MAP_NR(page_cache);
 275         add_to_page_cache(page, inode, offset);
 276         inode->i_op->readpage(inode, page);
 277         free_page(page_cache);
 278         return 0;
 279 #else
 280         return page_cache;
 281 #endif
 282 }
 283 
 284 /* 
 285  * Wait for IO to complete on a locked page.
 286  */
 287 void __wait_on_page(struct page *page)
 288 {
 289         struct wait_queue wait = { current, NULL };
 290 
 291         page->count++;
 292         add_wait_queue(&page->wait, &wait);
 293 repeat:
 294         run_task_queue(&tq_disk);
 295         current->state = TASK_UNINTERRUPTIBLE;
 296         if (PageLocked(page)) {
 297                 schedule();
 298                 goto repeat;
 299         }
 300         remove_wait_queue(&page->wait, &wait);
 301         page->count--;
 302         current->state = TASK_RUNNING;
 303 }
 304 
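/*
 * Illustrative sketch, not part of the original source: __wait_on_page() adds
 * itself to the page's wait queue and sets its task state *before* re-testing
 * PageLocked(), so a wake-up arriving between the test and the sleep cannot
 * be lost.  A userspace analogue of the same "publish the waiter first, then
 * re-check the condition in a loop" pattern, using a pthread condition
 * variable (an analogue only, not the kernel mechanism itself):
 */
#if 0
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unlocked = PTHREAD_COND_INITIALIZER;
static int page_locked = 1;                     /* protected by 'lock' */

static void wait_until_unlocked(void)
{
        pthread_mutex_lock(&lock);
        while (page_locked)                     /* re-check after every wake-up */
                pthread_cond_wait(&unlocked, &lock);
        pthread_mutex_unlock(&lock);
}

static void mark_unlocked(void)
{
        pthread_mutex_lock(&lock);
        page_locked = 0;
        pthread_cond_broadcast(&unlocked);      /* wake every waiter */
        pthread_mutex_unlock(&lock);
}
#endif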
 305 #if 0
 306 #define PROFILE_READAHEAD
 307 #define DEBUG_READAHEAD
 308 #endif
 309 
 310 /*
  311  * Read-ahead profiling information
  312  * --------------------------------
  313  * Every PROFILE_MAXREADCOUNT reads, the following information is written 
  314  * to the syslog:
  315  *   Percentage of asynchronous read-ahead.
  316  *   Average value of the read-ahead context fields.
 317  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 318  * to the syslog.
 319  */
 320 
 321 #ifdef PROFILE_READAHEAD
 322 
 323 #define PROFILE_MAXREADCOUNT 1000
 324 
 325 static unsigned long total_reada;
 326 static unsigned long total_async;
 327 static unsigned long total_ramax;
 328 static unsigned long total_ralen;
 329 static unsigned long total_rawin;
 330 
 331 static void profile_readahead(int async, struct file *filp)
 332 {
 333         unsigned long flags;
 334 
 335         ++total_reada;
 336         if (async)
 337                 ++total_async;
 338 
 339         total_ramax     += filp->f_ramax;
 340         total_ralen     += filp->f_ralen;
 341         total_rawin     += filp->f_rawin;
 342 
 343         if (total_reada > PROFILE_MAXREADCOUNT) {
 344                 save_flags(flags);
 345                 cli();
 346                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 347                         restore_flags(flags);
 348                         return;
 349                 }
 350 
 351                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 352                         total_ramax/total_reada,
 353                         total_ralen/total_reada,
 354                         total_rawin/total_reada,
 355                         (total_async*100)/total_reada);
 356 #ifdef DEBUG_READAHEAD
 357                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, rapos=%ld\n",
 358                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_rapos);
 359 #endif
 360 
 361                 total_reada     = 0;
 362                 total_async     = 0;
 363                 total_ramax     = 0;
 364                 total_ralen     = 0;
 365                 total_rawin     = 0;
 366 
 367                 restore_flags(flags);
 368         }
 369 }
 370 #endif  /* defined PROFILE_READAHEAD */
 371 
 372 /*
 373  * Read-ahead context:
 374  * -------------------
 375  * The read ahead context fields of the "struct file" are the following:
 376  * - f_rapos : position of the first byte after the last page we tried to
 377  *             read ahead.
 378  * - f_ramax : current read-ahead maximum size.
 379  * - f_ralen : length of the current IO read block we tried to read-ahead.
 380  * - f_rawin : length of the current read-ahead window.
 381  *             if last read-ahead was synchronous then
 382  *                  f_rawin = f_ralen
 383  *             otherwise (was asynchronous)
 384  *                  f_rawin = previous value of f_ralen + f_ralen
 385  *
 386  * Read-ahead limits:
 387  * ------------------
  388  * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
  389  * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 390  * MAX_READWINDOW  : maximum read window length.
 391  *
 392  * Synchronous read-ahead benefits:
 393  * --------------------------------
  394  * Using a reasonable IO transfer length for peripheral devices increases 
  395  * system performance.
  396  * Reasonable means, in this context, not too large but not too small.
 397  * The actual maximum value is MAX_READAHEAD + PAGE_SIZE = 32k
 398  *
 399  * Asynchronous read-ahead benefits:
 400  * ---------------------------------
  401  * Overlapping the next read request with user process execution increases 
  402  * system performance.
 403  *
 404  * Read-ahead risks:
 405  * -----------------
  406  * We have to guess which data the user process will need next.
  407  * If that data is often not really needed, it's bad for system 
  408  * performance.
  409  * However, we know that files are often accessed sequentially by 
  410  * application programs, so it seems possible to have a reasonably good 
  411  * guessing strategy.
  412  * We only try to read ahead in files that seem to be read sequentially.
 413  *
 414  * Asynchronous read-ahead risks:
 415  * ------------------------------
  416  * In order to maximize overlap, we must start the asynchronous read 
  417  * request from the device as soon as possible.
  418  * We must be very careful about:
  419  * - The number of effective pending IO read requests.
  420  *   ONE seems to be the only reasonable value.
 421  * - The total memory pool usage for the file access stream.
 422  *   We try to have a limit of MAX_READWINDOW = 48K.
 423  */
 424 
 425 #define MAX_READWINDOW (PAGE_SIZE*12)
 426 #define MAX_READAHEAD (PAGE_SIZE*7)
 427 #define MIN_READAHEAD (PAGE_SIZE)
 428 
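/*
 * Assuming a 4096-byte PAGE_SIZE (the value is architecture-dependent), the
 * limits above work out to MAX_READWINDOW = 48KiB, MAX_READAHEAD = 28KiB and
 * MIN_READAHEAD = 4KiB; the comment block above counts the current page as
 * well, giving the quoted 32KiB maximum IO size.  A trivial check of those
 * figures (illustrative only, not part of the original source):
 */
#if 0
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL                 /* assumed page size */

int main(void)
{
        printf("MAX_READWINDOW = %lu\n", SKETCH_PAGE_SIZE * 12); /* 49152 = 48KiB */
        printf("MAX_READAHEAD  = %lu\n", SKETCH_PAGE_SIZE * 7);  /* 28672 = 28KiB */
        printf("MIN_READAHEAD  = %lu\n", SKETCH_PAGE_SIZE);      /*  4096 =  4KiB */
        return 0;
}
#endif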
 429 static inline unsigned long generic_file_readahead(struct file * filp, struct inode * inode,
 430         int try_async, unsigned long pos, struct page * page,
 431         unsigned long page_cache)
 432 {
 433         unsigned long max_ahead, ahead;
 434         unsigned long rapos, ppos;
 435 
 436         ppos = pos & PAGE_MASK;
 437         rapos = filp->f_rapos & PAGE_MASK;
 438         max_ahead = 0;
 439 /*
 440  * If the current page is locked, try some synchronous read-ahead in order
 441  * to avoid too small IO requests.
 442  */
 443         if (PageLocked(page)) {
 444                 rapos = ppos;
 445                 if (rapos < inode->i_size)
 446                         max_ahead = filp->f_ramax;
 447                 filp->f_rawin = 0;
 448                 filp->f_ralen = PAGE_SIZE;
 449         }
 450 /*
 451  * The current page is not locked
 452  * If the current position is inside the last read-ahead IO request,
 453  * it is the moment to try asynchronous read-ahead.
 454  * try_async = 2 means that we have to force unplug of the device in
 455  * order to force read IO asynchronously.
 456  */
 457         else if (try_async == 1 && rapos >= PAGE_SIZE &&
 458                  ppos <= rapos && ppos + filp->f_ralen >= rapos) {
 459 /*
 460  * Add ONE page to max_ahead in order to try to have about the same IO max size
 461  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
 462  * Compute the position of the last page we have tried to read.
 463  */
 464                 rapos -= PAGE_SIZE;
 465                 if (rapos < inode->i_size)
 466                         max_ahead = filp->f_ramax + PAGE_SIZE;
 467 
 468                 if (max_ahead) {
 469                         filp->f_rawin = filp->f_ralen;
 470                         filp->f_ralen = 0;
 471                         try_async = 2;
 472                 }
 473         }
 474 /*
 475  * Try to read pages.
  476  * We hope that ll_rw_blk() plug/unplug, coalescing and sorting will work well
  477  * enough to avoid overly bad actual IO requests.
 478  */
 479         ahead = 0;
 480         while (ahead < max_ahead) {
 481                 ahead += PAGE_SIZE;
 482                 page_cache = try_to_read_ahead(inode, rapos + ahead, page_cache);
 483         }
 484 /*
  485  * If we tried to read some pages:
  486  * Update the read-ahead context.
  487  * Store the length of the current read-ahead window.
  488  * Add PAGE_SIZE to the max read-ahead size each time we have read ahead.
  489  *   That recipe avoids doing large IO for files that are not really
  490  *   accessed sequentially.
  491  * Do that only if the read-ahead window is smaller than MAX_READWINDOW,
  492  * in order to limit the number of pages used for this file access context.
 493  * If asynchronous,
 494  *    Try to force unplug of the device in order to start an asynchronous
 495  *    read IO request.
 496  */
 497         if (ahead) {
 498                 filp->f_ralen += ahead;
 499                 filp->f_rawin += filp->f_ralen;
 500                 filp->f_rapos = rapos + ahead + PAGE_SIZE;
 501 
 502                 if (filp->f_rawin < MAX_READWINDOW)
 503                         filp->f_ramax += PAGE_SIZE;
 504                 else if (filp->f_rawin > MAX_READWINDOW && filp->f_ramax > PAGE_SIZE)
 505                         filp->f_ramax -= PAGE_SIZE;
 506 
 507                 if (filp->f_ramax > MAX_READAHEAD)
 508                         filp->f_ramax = MAX_READAHEAD;
 509 #ifdef PROFILE_READAHEAD
 510                 profile_readahead((try_async == 2), filp);
 511 #endif
 512                 if (try_async == 2) {
 513                         run_task_queue(&tq_disk);
 514                 }
 515         }
 516 /*
 517  * Wait on the page if necessary
 518  */
 519         if (PageLocked(page)) {
 520                 __wait_on_page(page);
 521         }
 522         return page_cache;
 523 }
 524 
 525 
 526 /*
 527  * This is a generic file read routine, and uses the
 528  * inode->i_op->readpage() function for the actual low-level
 529  * stuff.
 530  *
 531  * This is really ugly. But the goto's actually try to clarify some
 532  * of the logic when it comes to error handling etc.
 533  */
 534 
 535 int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
 536 {
 537         int error, read;
 538         unsigned long pos, ppos, page_cache;
 539         int try_async;
 540 
 541         if (count <= 0)
 542                 return 0;
 543 
 544         error = 0;
 545         read = 0;
 546         page_cache = 0;
 547 
 548         pos = filp->f_pos;
 549         ppos = pos & PAGE_MASK;
 550 /*
 551  * Check if the current position is inside the previous read-ahead window.
  552  * If that's true, we assume that the file accesses are sequential enough to
  553  * continue asynchronous read-ahead.
  554  * Do minimum read-ahead at the beginning of the file since some tools
  555  * only read the beginning of files.
  556  * Break read-ahead if the file position is outside the previous read-ahead
  557  * window or if the read-ahead position is 0.
 558  */
 559 /*
 560  * Will not try asynchronous read-ahead.
  561  * Reset the read-ahead context to zero.
 562  */
 563         if (pos+count < MIN_READAHEAD || !filp->f_rapos ||
 564             ppos > filp->f_rapos || ppos + filp->f_rawin < filp->f_rapos) {
 565                 try_async = 0;
 566                 filp->f_rapos = 0;
 567                 filp->f_ralen = 0;
 568                 filp->f_ramax = 0;
 569                 filp->f_rawin = 0;
 570 /*
 571  * Will try asynchronous read-ahead.
 572  */
 573         } else {
 574                 try_async = 1;
 575         }
 576 /*
 577  * Adjust the current value of read-ahead max.
  578  * If the read operation stays within the first half page, force no read-ahead.
  579  * Otherwise, first try some value near count,
  580  *      doing at least MIN_READAHEAD and at most MAX_READAHEAD.
 581  */
 582         if (pos + count <= (PAGE_SIZE >> 1)) {
 583                 try_async = 0;
 584                 filp->f_ramax = 0;
 585         } else {
 586                 if (filp->f_ramax < count)
 587                         filp->f_ramax = count & PAGE_MASK;
 588 
 589                 if (filp->f_ramax < MIN_READAHEAD)
 590                         filp->f_ramax = MIN_READAHEAD;
 591                 else if (filp->f_ramax > MAX_READAHEAD)
 592                         filp->f_ramax = MAX_READAHEAD;
 593         }
 594 
 595         for (;;) {
 596                 struct page *page;
 597                 unsigned long offset, addr, nr;
 598 
 599                 if (pos >= inode->i_size)
 600                         break;
 601                 offset = pos & ~PAGE_MASK;
 602                 nr = PAGE_SIZE - offset;
 603                 /*
 604                  * Try to find the data in the page cache..
 605                  */
 606                 page = find_page(inode, pos & PAGE_MASK);
 607                 if (page)
 608                         goto found_page;
 609 
 610                 /*
 611                  * Ok, it wasn't cached, so we need to create a new
 612                  * page..
 613                  */
 614                 if (page_cache)
 615                         goto new_page;
 616 
 617                 error = -ENOMEM;
 618                 page_cache = __get_free_page(GFP_KERNEL);
 619                 if (!page_cache)
 620                         break;
 621                 error = 0;
 622 
 623                 /*
 624                  * That could have slept, so we need to check again..
 625                  */
 626                 if (pos >= inode->i_size)
 627                         break;
 628                 page = find_page(inode, pos & PAGE_MASK);
 629                 if (!page)
 630                         goto new_page;
 631 
 632 found_page:
 633                 addr = page_address(page);
 634                 if (nr > count)
 635                         nr = count;
 636 /*
  637  * Do not try to read ahead unless the current page is already filled or being filled.
  638  * If our goal was to try asynchronous read-ahead, we were quite wrong.
  639  * Set the max read-ahead to some shorter value in order to partially
  640  * correct this mistake.
 641  */
 642                 if (PageUptodate(page) || PageLocked(page))
 643                         page_cache = generic_file_readahead(filp, inode, try_async, pos, page, page_cache);
 644                 else if (try_async) {
 645                         if (filp->f_ramax > MIN_READAHEAD)
 646                                 filp->f_ramax -= PAGE_SIZE;
 647                 }
 648 
 649                 if (!PageUptodate(page))
 650                         goto read_page;
 651                 if (nr > inode->i_size - pos)
 652                         nr = inode->i_size - pos;
 653                 memcpy_tofs(buf, (void *) (addr + offset), nr);
 654                 free_page(addr);
 655                 buf += nr;
 656                 pos += nr;
 657                 read += nr;
 658                 count -= nr;
 659                 if (count)
 660                         continue;
 661                 break;
 662         
 663 
 664 new_page:
 665                 /*
 666                  * Ok, add the new page to the hash-queues...
 667                  */
 668                 addr = page_cache;
 669                 page = mem_map + MAP_NR(page_cache);
 670                 page_cache = 0;
 671                 add_to_page_cache(page, inode, pos & PAGE_MASK);
 672 
 673                 /*
 674                  * Error handling is tricky. If we get a read error,
 675                  * the cached page stays in the cache (but uptodate=0),
 676                  * and the next process that accesses it will try to
 677                  * re-read it. This is needed for NFS etc, where the
 678                  * identity of the reader can decide if we can read the
 679                  * page or not..
 680                  */
 681 read_page:
 682                 error = inode->i_op->readpage(inode, page);
 683                 if (!error)
 684                         goto found_page;
 685                 free_page(addr);
 686                 break;
 687         }
 688 
 689         filp->f_pos = pos;
 690         filp->f_reada = 1;
 691         if (page_cache)
 692                 free_page(page_cache);
 693         if (!IS_RDONLY(inode)) {
 694                 inode->i_atime = CURRENT_TIME;
 695                 inode->i_dirt = 1;
 696         }
 697         if (!read)
 698                 read = error;
 699         return read;
 700 }
 701 
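/*
 * Illustrative sketch, not part of the original source: the decision at the
 * top of generic_file_read() about whether to keep the previous read-ahead
 * context reduces to a window-membership test on the page-aligned position.
 * Pulled out as a standalone predicate (field names kept, types simplified;
 * the 4096-byte MIN_READAHEAD assumes a 4096-byte page):
 */
#if 0
#define SKETCH_MIN_READAHEAD 4096UL

/*
 * Returns non-zero when the previous read-ahead context is still usable,
 * i.e. the page-aligned position "ppos" falls inside the window
 * [f_rapos - f_rawin, f_rapos] and the read is big enough to matter.
 */
static int keep_readahead_context(unsigned long pos, unsigned long count,
                                  unsigned long ppos,
                                  unsigned long f_rapos, unsigned long f_rawin)
{
        if (pos + count < SKETCH_MIN_READAHEAD)
                return 0;                       /* read ends in the first few KiB of the file */
        if (f_rapos == 0)
                return 0;                       /* no previous read-ahead to continue */
        if (ppos > f_rapos)
                return 0;                       /* jumped forward past the window */
        if (ppos + f_rawin < f_rapos)
                return 0;                       /* fell behind the window */
        return 1;
}
#endif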
 702 /*
 703  * Find a cached page and wait for it to become up-to-date, return
 704  * the page address.  Increments the page count.
 705  */
 706 static inline unsigned long fill_page(struct inode * inode, unsigned long offset)
 707 {
 708         struct page * page;
 709         unsigned long new_page;
 710 
 711         page = find_page(inode, offset);
 712         if (page)
 713                 goto found_page_dont_free;
 714         new_page = __get_free_page(GFP_KERNEL);
 715         page = find_page(inode, offset);
 716         if (page)
 717                 goto found_page;
 718         if (!new_page)
 719                 return 0;
 720         page = mem_map + MAP_NR(new_page);
 721         new_page = 0;
 722         add_to_page_cache(page, inode, offset);
 723         inode->i_op->readpage(inode, page);
 724         if (PageLocked(page))
 725                 new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
 726 found_page:
 727         if (new_page)
 728                 free_page(new_page);
 729 found_page_dont_free:
 730         wait_on_page(page);
 731         return page_address(page);
 732 }
 733 
 734 /*
 735  * Semantics for shared and private memory areas are different past the end
 736  * of the file. A shared mapping past the last page of the file is an error
 737  * and results in a SIGBUS, while a private mapping just maps in a zero page.
 738  */
 739 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 740 {
 741         unsigned long offset;
 742         struct inode * inode = area->vm_inode;
 743         unsigned long page;
 744 
 745         offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 746         if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 747                 return 0;
 748 
 749         page = fill_page(inode, offset);
 750         if (page && no_share) {
 751                 unsigned long new_page = __get_free_page(GFP_KERNEL);
 752                 if (new_page)
 753                         memcpy((void *) new_page, (void *) page, PAGE_SIZE);
 754                 free_page(page);
 755                 return new_page;
 756         }
 757         return page;
 758 }
 759 
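/*
 * Illustrative sketch, not part of the original source: the first statement
 * of filemap_nopage() converts a faulting user address back into a file
 * offset using the mapping's start address and its starting file offset.
 * That arithmetic in isolation, assuming a 4096-byte page:
 */
#if 0
#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

/*
 * address   : faulting user virtual address
 * vm_start  : user address where the mapping begins
 * vm_offset : file offset that vm_start corresponds to (page aligned)
 *
 * Returns the page-aligned file offset backing "address".
 */
static unsigned long fault_file_offset(unsigned long address,
                                       unsigned long vm_start,
                                       unsigned long vm_offset)
{
        return (address & SKETCH_PAGE_MASK) - vm_start + vm_offset;
}
#endif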
 760 /*
 761  * Tries to write a shared mapped page to its backing store. May return -EIO
 762  * if the disk is full.
 763  */
 764 static inline int do_write_page(struct inode * inode, struct file * file,
 765         const char * page, unsigned long offset)
 766 {
 767         int old_fs, retval;
 768         unsigned long size;
 769 
 770         size = offset + PAGE_SIZE;
 771         /* refuse to extend file size.. */
 772         if (S_ISREG(inode->i_mode)) {
 773                 if (size > inode->i_size)
 774                         size = inode->i_size;
 775                 /* Ho humm.. We should have tested for this earlier */
 776                 if (size < offset)
 777                         return -EIO;
 778         }
 779         size -= offset;
 780         old_fs = get_fs();
 781         set_fs(KERNEL_DS);
 782         retval = -EIO;
 783         if (size == file->f_op->write(inode, file, (const char *) page, size))
 784                 retval = 0;
 785         set_fs(old_fs);
 786         return retval;
 787 }
 788 
 789 static int filemap_write_page(struct vm_area_struct * vma,
 790         unsigned long offset,
 791         unsigned long page)
 792 {
 793         int result;
 794         struct file file;
 795         struct inode * inode;
 796         struct buffer_head * bh;
 797 
 798         bh = mem_map[MAP_NR(page)].buffers;
 799         if (bh) {
 800                 /* whee.. just mark the buffer heads dirty */
 801                 struct buffer_head * tmp = bh;
 802                 do {
 803                         mark_buffer_dirty(tmp, 0);
 804                         tmp = tmp->b_this_page;
 805                 } while (tmp != bh);
 806                 return 0;
 807         }
 808 
 809         inode = vma->vm_inode;
 810         file.f_op = inode->i_op->default_file_ops;
 811         if (!file.f_op->write)
 812                 return -EIO;
 813         file.f_mode = 3;
 814         file.f_flags = 0;
 815         file.f_count = 1;
 816         file.f_inode = inode;
 817         file.f_pos = offset;
 818         file.f_reada = 0;
 819 
 820         down(&inode->i_sem);
 821         result = do_write_page(inode, &file, (const char *) page, offset);
 822         up(&inode->i_sem);
 823         return result;
 824 }
 825 
 826 
 827 /*
 828  * Swapping to a shared file: while we're busy writing out the page
 829  * (and the page still exists in memory), we save the page information
 830  * in the page table, so that "filemap_swapin()" can re-use the page
 831  * immediately if it is called while we're busy swapping it out..
 832  *
 833  * Once we've written it all out, we mark the page entry "empty", which
 834  * will result in a normal page-in (instead of a swap-in) from the now
 835  * up-to-date disk file.
 836  */
 837 int filemap_swapout(struct vm_area_struct * vma,
 838         unsigned long offset,
 839         pte_t *page_table)
 840 {
 841         int error;
 842         unsigned long page = pte_page(*page_table);
 843         unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
 844 
 845         flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 846         set_pte(page_table, __pte(entry));
 847         flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 848         error = filemap_write_page(vma, offset, page);
 849         if (pte_val(*page_table) == entry)
 850                 pte_clear(page_table);
 851         return error;
 852 }
 853 
 854 /*
 855  * filemap_swapin() is called only if we have something in the page
 856  * tables that is non-zero (but not present), which we know to be the
 857  * page index of a page that is busy being swapped out (see above).
 858  * So we just use it directly..
 859  */
 860 static pte_t filemap_swapin(struct vm_area_struct * vma,
 861         unsigned long offset,
 862         unsigned long entry)
 863 {
 864         unsigned long page = SWP_OFFSET(entry);
 865 
 866         mem_map[page].count++;
 867         page = (page << PAGE_SHIFT) + PAGE_OFFSET;
 868         return mk_pte(page,vma->vm_page_prot);
 869 }
 870 
 871 
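/*
 * Illustrative sketch, not part of the original source: SWP_ENTRY() above
 * packs a swap type and an index into a single word that the page table can
 * hold with the hardware "present" bit clear; SWP_OFFSET() in filemap_swapin()
 * recovers the index.  The real bit layout is architecture-specific; a
 * hypothetical encoding in the same spirit:
 */
#if 0
/* Hypothetical layout: bit 0 stays clear (the "present" bit), bits 1-7 hold
   the swap type, and the remaining bits hold the offset. */
#define SKETCH_SWP_TYPE_SHIFT   1
#define SKETCH_SWP_TYPE_BITS    7
#define SKETCH_SWP_OFFSET_SHIFT (SKETCH_SWP_TYPE_SHIFT + SKETCH_SWP_TYPE_BITS)

#define SKETCH_SWP_ENTRY(type, offset) \
        (((unsigned long) (type) << SKETCH_SWP_TYPE_SHIFT) | \
         ((unsigned long) (offset) << SKETCH_SWP_OFFSET_SHIFT))
#define SKETCH_SWP_TYPE(entry) \
        (((entry) >> SKETCH_SWP_TYPE_SHIFT) & ((1UL << SKETCH_SWP_TYPE_BITS) - 1))
#define SKETCH_SWP_OFFSET(entry) \
        ((entry) >> SKETCH_SWP_OFFSET_SHIFT)
#endif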
 872 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 873         unsigned long address, unsigned int flags)
 874 {
 875         pte_t pte = *ptep;
 876         unsigned long page;
 877         int error;
 878 
 879         if (!(flags & MS_INVALIDATE)) {
 880                 if (!pte_present(pte))
 881                         return 0;
 882                 if (!pte_dirty(pte))
 883                         return 0;
 884                 flush_cache_page(vma, address);
 885                 set_pte(ptep, pte_mkclean(pte));
 886                 flush_tlb_page(vma, address);
 887                 page = pte_page(pte);
 888                 mem_map[MAP_NR(page)].count++;
 889         } else {
 890                 if (pte_none(pte))
 891                         return 0;
 892                 flush_cache_page(vma, address);
 893                 pte_clear(ptep);
 894                 flush_tlb_page(vma, address);
 895                 if (!pte_present(pte)) {
 896                         swap_free(pte_val(pte));
 897                         return 0;
 898                 }
 899                 page = pte_page(pte);
 900                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 901                         free_page(page);
 902                         return 0;
 903                 }
 904         }
 905         error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 906         free_page(page);
 907         return error;
 908 }
 909 
 910 static inline int filemap_sync_pte_range(pmd_t * pmd,
 911         unsigned long address, unsigned long size, 
 912         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
 913 {
 914         pte_t * pte;
 915         unsigned long end;
 916         int error;
 917 
 918         if (pmd_none(*pmd))
 919                 return 0;
 920         if (pmd_bad(*pmd)) {
 921                 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 922                 pmd_clear(pmd);
 923                 return 0;
 924         }
 925         pte = pte_offset(pmd, address);
 926         offset += address & PMD_MASK;
 927         address &= ~PMD_MASK;
 928         end = address + size;
 929         if (end > PMD_SIZE)
 930                 end = PMD_SIZE;
 931         error = 0;
 932         do {
 933                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
 934                 address += PAGE_SIZE;
 935                 pte++;
 936         } while (address < end);
 937         return error;
 938 }
 939 
 940 static inline int filemap_sync_pmd_range(pgd_t * pgd,
 941         unsigned long address, unsigned long size, 
 942         struct vm_area_struct *vma, unsigned int flags)
 943 {
 944         pmd_t * pmd;
 945         unsigned long offset, end;
 946         int error;
 947 
 948         if (pgd_none(*pgd))
 949                 return 0;
 950         if (pgd_bad(*pgd)) {
 951                 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
 952                 pgd_clear(pgd);
 953                 return 0;
 954         }
 955         pmd = pmd_offset(pgd, address);
 956         offset = address & PGDIR_MASK;
 957         address &= ~PGDIR_MASK;
 958         end = address + size;
 959         if (end > PGDIR_SIZE)
 960                 end = PGDIR_SIZE;
 961         error = 0;
 962         do {
 963                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
 964                 address = (address + PMD_SIZE) & PMD_MASK;
 965                 pmd++;
 966         } while (address < end);
 967         return error;
 968 }
 969 
 970 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
 971         size_t size, unsigned int flags)
 972 {
 973         pgd_t * dir;
 974         unsigned long end = address + size;
 975         int error = 0;
 976 
 977         dir = pgd_offset(current->mm, address);
 978         flush_cache_range(vma->vm_mm, end - size, end);
 979         while (address < end) {
 980                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
 981                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 982                 dir++;
 983         }
 984         flush_tlb_range(vma->vm_mm, end - size, end);
 985         return error;
 986 }
 987 
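/*
 * Illustrative sketch, not part of the original source: filemap_sync() and
 * the two helpers above follow the usual pattern for walking an address range
 * through a multi-level page table.  Each level clamps the end of the range
 * to the boundary of its own region and hands the remainder down a level.
 * The control structure alone, with hypothetical region sizes and the
 * per-page work stubbed out:
 */
#if 0
#define SKETCH_PAGE_SIZE  (1UL << 12)           /* hypothetical region sizes */
#define SKETCH_PMD_SIZE   (1UL << 21)
#define SKETCH_PGDIR_SIZE (1UL << 30)

static void visit_pte(unsigned long address)    /* cf. filemap_sync_pte() */
{
        (void) address;
}

static void walk_pte_range(unsigned long address, unsigned long end)
{
        /* clamp to the end of the pmd region that contains "address" */
        unsigned long boundary = (address & ~(SKETCH_PMD_SIZE - 1)) + SKETCH_PMD_SIZE;

        if (end > boundary)
                end = boundary;
        do {
                visit_pte(address);
                address += SKETCH_PAGE_SIZE;
        } while (address < end);
}

static void walk_pmd_range(unsigned long address, unsigned long end)
{
        /* clamp to the end of the pgd region that contains "address" */
        unsigned long boundary = (address & ~(SKETCH_PGDIR_SIZE - 1)) + SKETCH_PGDIR_SIZE;

        if (end > boundary)
                end = boundary;
        do {
                walk_pte_range(address, end);
                address = (address + SKETCH_PMD_SIZE) & ~(SKETCH_PMD_SIZE - 1);
        } while (address < end);
}

static void walk_range(unsigned long address, unsigned long end)
{
        while (address < end) {
                walk_pmd_range(address, end);
                address = (address + SKETCH_PGDIR_SIZE) & ~(SKETCH_PGDIR_SIZE - 1);
        }
}
#endif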
 988 /*
 989  * This handles (potentially partial) area unmaps..
 990  */
 991 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
 992 {
 993         filemap_sync(vma, start, len, MS_ASYNC);
 994 }
 995 
 996 /*
 997  * Shared mappings need to be able to do the right thing at
 998  * close/unmap/sync. They will also use the private file as
 999  * backing-store for swapping..
1000  */
1001 static struct vm_operations_struct file_shared_mmap = {
1002         NULL,                   /* no special open */
1003         NULL,                   /* no special close */
1004         filemap_unmap,          /* unmap - we need to sync the pages */
1005         NULL,                   /* no special protect */
1006         filemap_sync,           /* sync */
1007         NULL,                   /* advise */
1008         filemap_nopage,         /* nopage */
1009         NULL,                   /* wppage */
1010         filemap_swapout,        /* swapout */
1011         filemap_swapin,         /* swapin */
1012 };
1013 
1014 /*
1015  * Private mappings just need to be able to load in the map.
1016  *
1017  * (This is actually used for shared mappings as well, if we
1018  * know they can't ever get write permissions..)
1019  */
1020 static struct vm_operations_struct file_private_mmap = {
1021         NULL,                   /* open */
1022         NULL,                   /* close */
1023         NULL,                   /* unmap */
1024         NULL,                   /* protect */
1025         NULL,                   /* sync */
1026         NULL,                   /* advise */
1027         filemap_nopage,         /* nopage */
1028         NULL,                   /* wppage */
1029         NULL,                   /* swapout */
1030         NULL,                   /* swapin */
1031 };
1032 
1033 /* This is used for a general mmap of a disk file */
1034 int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
1035 {
1036         struct vm_operations_struct * ops;
1037 
1038         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1039                 ops = &file_shared_mmap;
1040                 /* share_page() can only guarantee proper page sharing if
1041                  * the offsets are all page aligned. */
1042                 if (vma->vm_offset & (PAGE_SIZE - 1))
1043                         return -EINVAL;
1044         } else {
1045                 ops = &file_private_mmap;
1046                 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1047                         return -EINVAL;
1048         }
1049         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1050                 return -EACCES;
1051         if (!inode->i_op || !inode->i_op->readpage)
1052                 return -ENOEXEC;
1053         if (!IS_RDONLY(inode)) {
1054                 inode->i_atime = CURRENT_TIME;
1055                 inode->i_dirt = 1;
1056         }
1057         vma->vm_inode = inode;
1058         inode->i_count++;
1059         vma->vm_ops = ops;
1060         return 0;
1061 }
1062 
1063 
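/*
 * Illustrative sketch, not part of the original source: from user space the
 * path above is reached through mmap(2) on a regular file; a private
 * read-only mapping gets file_private_mmap and its pages are faulted in on
 * demand through filemap_nopage().  A minimal example (the file name is
 * arbitrary and error handling is shortened):
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        struct stat st;
        char *p;
        int fd = open("/etc/hostname", O_RDONLY);       /* hypothetical example file */

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                return 1;
        /* Private, read-only mapping: no page is read until it is touched. */
        p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        fwrite(p, 1, st.st_size, stdout);               /* first touch of each page faults it in */
        munmap(p, st.st_size);
        close(fd);
        return 0;
}
#endif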
1064 /*
1065  * The msync() system call.
1066  */
1067 
1068 static int msync_interval(struct vm_area_struct * vma,
1069         unsigned long start, unsigned long end, int flags)
1070 {
1071         if (!vma->vm_inode)
1072                 return 0;
1073         if (vma->vm_ops->sync) {
1074                 int error;
1075                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1076                 if (error)
1077                         return error;
1078                 if (flags & MS_SYNC)
1079                         return file_fsync(vma->vm_inode, NULL);
1080                 return 0;
1081         }
1082         return 0;
1083 }
1084 
1085 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1086 {
1087         unsigned long end;
1088         struct vm_area_struct * vma;
1089         int unmapped_error, error;
1090 
1091         if (start & ~PAGE_MASK)
1092                 return -EINVAL;
1093         len = (len + ~PAGE_MASK) & PAGE_MASK;
1094         end = start + len;
1095         if (end < start)
1096                 return -EINVAL;
1097         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1098                 return -EINVAL;
1099         if (end == start)
1100                 return 0;
1101         /*
1102          * If the interval [start,end) covers some unmapped address ranges,
1103          * just ignore them, but return -EFAULT at the end.
1104          */
1105         vma = find_vma(current, start);
1106         unmapped_error = 0;
1107         for (;;) {
1108                 /* Still start < end. */
1109                 if (!vma)
1110                         return -EFAULT;
1111                 /* Here start < vma->vm_end. */
1112                 if (start < vma->vm_start) {
1113                         unmapped_error = -EFAULT;
1114                         start = vma->vm_start;
1115                 }
1116                 /* Here vma->vm_start <= start < vma->vm_end. */
1117                 if (end <= vma->vm_end) {
1118                         if (start < end) {
1119                                 error = msync_interval(vma, start, end, flags);
1120                                 if (error)
1121                                         return error;
1122                         }
1123                         return unmapped_error;
1124                 }
1125                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1126                 error = msync_interval(vma, start, vma->vm_end, flags);
1127                 if (error)
1128                         return error;
1129                 start = vma->vm_end;
1130                 vma = vma->vm_next;
1131         }
1132 }
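/*
 * Illustrative sketch, not part of the original source: sys_msync() above is
 * what a shared file mapping goes through when user space calls msync(2).
 * A minimal example that dirties a shared mapping and forces it back to the
 * file with MS_SYNC (the path name is arbitrary, error handling shortened):
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        const char msg[] = "hello";
        char *p;
        int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0600);  /* hypothetical path */

        if (fd < 0 || ftruncate(fd, 4096) < 0)
                return 1;
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memcpy(p, msg, sizeof(msg));            /* dirty the shared page */
        if (msync(p, 4096, MS_SYNC) < 0)        /* write it back and wait for it */
                return 1;
        munmap(p, 4096);
        close(fd);
        return 0;
}
#endif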
