/*
 * malloc.c --- a general purpose kernel memory allocator for Linux.
 *
 * Written by Theodore Ts'o (tytso@mit.edu), 11/29/91
 *
 * This routine is written to be as fast as possible, so that it
 * can be called from the interrupt level.
 *
 * Limitations: maximum size of memory we can allocate using this routine
 *	is 4k, the size of a page in Linux.
 *
 * The general game plan is that each page (called a bucket) will only hold
 * objects of a given size.  When all of the objects on a page are released,
 * the page can be returned to the general free pool.  When kmalloc() is
 * called, it looks for the smallest bucket size which will fulfill its
 * request, and allocates a piece of memory from that bucket pool.
 *
 * Each bucket has as its control block a bucket descriptor which keeps
 * track of how many objects are in use on that page, and the free list
 * for that page.  Like the buckets themselves, bucket descriptors are
 * stored on pages requested from get_free_page().  However, unlike buckets,
 * pages devoted to bucket descriptors are never released back to the
 * system.  Fortunately, a system should probably only need 1 or 2 bucket
 * descriptor pages, since a page can hold 256 bucket descriptors (which
 * corresponds to 1 megabyte worth of bucket pages).  If the kernel is using
 * that much allocated memory, it's probably doing something wrong.  :-)
 *
 * Note: kmalloc() and kfree() both call get_free_page() and free_page()
 *	in sections of code where interrupts are turned off, to allow
 *	kmalloc() and kfree() to be safely called from an interrupt routine.
 *	(We will probably need this functionality when networking code,
 *	particularly things like NFS, is added to Linux.)  However, this
 *	presumes that get_free_page() and free_page() are interrupt-level
 *	safe, which they may not be once paging is added.  If this is the
 *	case, we will need to modify kmalloc() to keep a few unused pages
 *	"pre-allocated" so that it can safely draw upon those pages if
 *	it is called from an interrupt routine.
 *
 *	Another concern is that get_free_page() should not sleep; if it
 *	does, the code is carefully ordered so as to avoid any race
 *	conditions.  The catch is that if kmalloc() is called re-entrantly,
 *	there is a chance that unnecessary pages will be grabbed from the
 *	system.  Except for the bucket descriptor pages, the extra pages
 *	will eventually get released back to the system, though, so it
 *	isn't all that bad.
 */

/* I'm going to modify it to keep some free pages around.  get_free_page()
   can sleep, and tcp/ip needs to call kmalloc at interrupt time (or keep
   big buffers around for itself).  I guess I'll have the return-from-
   syscall path fill up the free page descriptors. -RAB */

/* Since the advent of GFP_ATOMIC, I've changed the kmalloc code to
   use it and return NULL if it can't get a page. -RAB */
/* (mostly just undid the previous changes -RAB) */

/* I've added the priority argument to kmalloc so routines can
   sleep on memory if they want. -RAB */

/* I've also got to make sure that kmalloc is reentrant now. */
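
/*
 * Usage sketch (a hypothetical caller, not part of this file).  GFP_KERNEL
 * is assumed here to be the "may sleep" priority; GFP_ATOMIC must be used
 * when calling from interrupt level:
 *
 *	char *buf = kmalloc(100, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	kfree_s(buf, 100);
 *
 * A 100-byte request is rounded up to the 128-byte bucket; passing the size
 * to kfree_s() lets the free path skip the smaller bucket chains.
 */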

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>

#include <asm/system.h>

struct bucket_desc {	/* 16 bytes */
	void			*page;
	struct bucket_desc	*next;
	void			*freeptr;
	unsigned short		refcnt;
	unsigned short		bucket_size;
};

struct _bucket_dir {	/* 8 bytes */
	int			size;
	struct bucket_desc	*chain;
};

/*
 * The following is where we store a pointer to the first bucket
 * descriptor for a given size.
 *
 * If it turns out that the Linux kernel allocates a lot of objects of a
 * specific size, then we may want to add that specific size to this list,
 * since that will allow the memory to be allocated more efficiently.
 * However, since an entire page must be dedicated to each specific size
 * on this list, some amount of temperance must be exercised here.
 *
 * Note that this list *must* be kept in order.
 */
struct _bucket_dir bucket_dir[] = {
	{ 16,	(struct bucket_desc *) 0},
	{ 32,	(struct bucket_desc *) 0},
	{ 64,	(struct bucket_desc *) 0},
	{ 128,	(struct bucket_desc *) 0},
	{ 256,	(struct bucket_desc *) 0},
	{ 512,	(struct bucket_desc *) 0},
	{ 1024,	(struct bucket_desc *) 0},
	{ 2048,	(struct bucket_desc *) 0},
	{ 4096,	(struct bucket_desc *) 0},
	{ 0,	(struct bucket_desc *) 0}};	/* End of list marker */

/*
 * This contains a linked list of free bucket descriptor blocks
 */
static struct bucket_desc *free_bucket_desc = (struct bucket_desc *) 0;

/*
 * This routine initializes a bucket descriptor page.
 *
 * It assumes it is called with interrupts on, and will return that way.
 * It can also sleep if priority != GFP_ATOMIC.
 */
static inline int init_bucket_desc(int priority)
{
	struct bucket_desc *bdesc, *first;
	int i;

	/* This may turn interrupts on, so we should be careful. */
	first = bdesc = (struct bucket_desc *) get_free_page(priority);
	if (!bdesc)
		return 1;

	/* At this point it is possible that we have slept and
	   free has been called etc.  So we might not actually need
	   this page anymore. */

	/* Link all but the last descriptor on the page together. */
	for (i = PAGE_SIZE/sizeof(struct bucket_desc); i > 1; i--) {
		bdesc->next = bdesc+1;
		bdesc++;
	}
	/*
	 * This is done last, to avoid race conditions in case
	 * get_free_page() sleeps and this routine gets called again....
	 */
	cli();
	bdesc->next = free_bucket_desc;
	free_bucket_desc = first;
	sti();
	return 0;
}

void *
kmalloc(unsigned int len, int priority)
{
	struct _bucket_dir *bdir;
	struct bucket_desc *bdesc;
	void *retval;

	/*
	 * First we search the bucket_dir to find the smallest bucket
	 * size that will satisfy this request.
	 */

	/* The sizes are static, so there is no reentry problem here. */
	for (bdir = bucket_dir; bdir->size; bdir++)
		if (bdir->size >= len)
			break;

	if (!bdir->size) {
		/* This should be changed for sizes > 1 page. */
		printk("kmalloc called with impossibly large argument (%d)\n",
			len);
		return NULL;
	}

	/*
	 * Now we search for a bucket descriptor which has free space.
	 */
	cli();	/* Avoid race conditions */
	for (bdesc = bdir->chain; bdesc != NULL; bdesc = bdesc->next)
		if (bdesc->freeptr != NULL || bdesc->page == NULL)
			break;
	/*
	 * If we didn't find a bucket with free space, then we'll
	 * allocate a new one.
	 */
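	/*
	 * (Roughly: take a descriptor from free_bucket_desc, refilling that
	 * list via init_bucket_desc() if it is empty, then grab a fresh page
	 * and thread it into a singly linked free list of bdir->size byte
	 * objects.)
	 */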
	if (!bdesc) {
		char *cp;
		int i;

		/* This must be a while loop because it is possible to be
		   interrupted after init_bucket_desc() and before cli().
		   The interrupt could steal our free descriptor. */
		while (!free_bucket_desc) {
			sti();	/* This would happen anyway, so we
				   might as well make it explicit. */
			if (init_bucket_desc(priority))
				return NULL;
			cli();	/* Turn them back off. */
		}

		bdesc = free_bucket_desc;
		free_bucket_desc = bdesc->next;

		/* get_free_page() will turn interrupts back on,
		   so we might as well do it ourselves. */
		sti();
		bdesc->refcnt = 0;
		bdesc->bucket_size = bdir->size;
		cp = (char *) get_free_page(priority);
		bdesc->page = bdesc->freeptr = (void *) cp;

		if (!cp) {
			/* Put bdesc back on the free list. */
			cli();
			bdesc->next = free_bucket_desc;
			free_bucket_desc = bdesc;
			sti();

			return NULL;
		}

		/* Set up the chain of free objects */
		for (i = PAGE_SIZE/bdir->size; i > 1; i--) {
			*((char **) cp) = cp + bdir->size;
			cp += bdir->size;
		}
		*((char **) cp) = 0;

		/* Turn interrupts back off for putting the
		   thing onto the chain. */
		cli();
		/* Remember: bdir itself is not changed. */
		bdesc->next = bdir->chain;	/* OK, link it in! */
		bdir->chain = bdesc;
	}

	retval = (void *) bdesc->freeptr;
	bdesc->freeptr = *((void **) retval);
	bdesc->refcnt++;
	sti();	/* OK, we're safe again */
	return retval;
}

/*
 * Here is the kfree routine.  If you know the size of the object that you
 * are freeing, then kfree_s() will use that information to speed up the
 * search for the bucket descriptor.
 *
 * We will #define a macro so that "kfree(x)" becomes "kfree_s(x, 0)".
 */
void kfree_s(void *obj, int size)
{
	void *page;
	struct _bucket_dir *bdir;
	struct bucket_desc *bdesc, *prev;

	/* Calculate what page this object lives in (assumes 4k pages). */
	page = (void *) ((unsigned long) obj & 0xfffff000);

	/* Now search the buckets looking for that page */
	for (bdir = bucket_dir; bdir->size; bdir++) {
		prev = 0;
		/* If size is zero then this conditional is always true */
		if (bdir->size >= size) {
			/* We have to turn off interrupts here because
			   we are descending the chain.  If something
			   changes it in the middle we could suddenly
			   find ourselves descending the free list.
			   I think this would only cause a memory
			   leak, but better safe than sorry. */
			cli();	/* To avoid race conditions */
			for (bdesc = bdir->chain; bdesc; bdesc = bdesc->next) {
				if (bdesc->page == page)
					goto found;
				prev = bdesc;
			}
		}
	}

	printk("Bad address passed to kernel kfree_s(%X, %d)\n", obj, size);
	sti();
	return;
found:
	/* Interrupts are off here. */
	*((void **) obj) = bdesc->freeptr;
	bdesc->freeptr = obj;
	bdesc->refcnt--;
	if (bdesc->refcnt == 0) {
		/*
		 * We need to make sure that prev is still accurate.  It
		 * may not be, if someone rudely interrupted us....
		 */
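		/*
		 * (If prev no longer points at bdesc, the chain is walked
		 * again below to find the true predecessor before bdesc is
		 * unlinked; the descriptor then goes back on free_bucket_desc
		 * and the now-empty page is returned via free_page().)
		 */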
		if ((prev && (prev->next != bdesc)) ||
		    (!prev && (bdir->chain != bdesc)))
			for (prev = bdir->chain; prev; prev = prev->next)
				if (prev->next == bdesc)
					break;
		if (prev)
			prev->next = bdesc->next;
		else {
			if (bdir->chain != bdesc)
				panic("kmalloc bucket chains corrupted");
			bdir->chain = bdesc->next;
		}
		bdesc->next = free_bucket_desc;
		free_bucket_desc = bdesc;
		free_page((unsigned long) bdesc->page);
	}
	sti();
	return;
}
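
/*
 * For completeness, a sketch of the kfree() convenience macro mentioned
 * above.  It would live in a header shared with callers rather than in
 * this file (e.g. <linux/mm.h>; the exact header is an assumption here):
 *
 *	#define kfree(x) kfree_s((x), 0)
 */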