Btrfs: Raise thresholds for metadata writeback
[linux-2.6] / kernel / power / snapshot.c
1 /*
2  * linux/kernel/power/snapshot.c
3  *
4  * This file provides system snapshot/restore functionality for swsusp.
5  *
6  * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7  * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8  *
9  * This file is released under the GPLv2.
10  *
11  */
12
13 #include <linux/version.h>
14 #include <linux/module.h>
15 #include <linux/mm.h>
16 #include <linux/suspend.h>
17 #include <linux/delay.h>
18 #include <linux/bitops.h>
19 #include <linux/spinlock.h>
20 #include <linux/kernel.h>
21 #include <linux/pm.h>
22 #include <linux/device.h>
23 #include <linux/init.h>
24 #include <linux/bootmem.h>
25 #include <linux/syscalls.h>
26 #include <linux/console.h>
27 #include <linux/highmem.h>
28
29 #include <asm/uaccess.h>
30 #include <asm/mmu_context.h>
31 #include <asm/pgtable.h>
32 #include <asm/tlbflush.h>
33 #include <asm/io.h>
34
35 #include "power.h"
36
37 static int swsusp_page_is_free(struct page *);
38 static void swsusp_set_page_forbidden(struct page *);
39 static void swsusp_unset_page_forbidden(struct page *);
40
41 /* List of PBEs needed for restoring the pages that were allocated before
42  * the suspend and included in the suspend image, but have also been
43  * allocated by the "resume" kernel, so their contents cannot be written
44  * directly to their "original" page frames.
45  */
46 struct pbe *restore_pblist;
47
48 /* Pointer to an auxiliary buffer (1 page) */
49 static void *buffer;
50
51 /**
52  *      @safe_needed - on resume, for storing the PBE list and the image,
53  *      we can only use memory pages that do not conflict with the pages
54  *      used before suspend.  The unsafe pages have PageNosaveFree set
55  *      and we count them using unsafe_pages.
56  *
57  *      Each allocated image page is marked as PageNosave and PageNosaveFree
58  *      so that swsusp_free() can release it.
59  */
60
61 #define PG_ANY          0
62 #define PG_SAFE         1
63 #define PG_UNSAFE_CLEAR 1
64 #define PG_UNSAFE_KEEP  0
65
66 static unsigned int allocated_unsafe_pages;
67
68 static void *get_image_page(gfp_t gfp_mask, int safe_needed)
69 {
70         void *res;
71
72         res = (void *)get_zeroed_page(gfp_mask);
73         if (safe_needed)
74                 while (res && swsusp_page_is_free(virt_to_page(res))) {
75                         /* The page is unsafe, mark it for swsusp_free() */
76                         swsusp_set_page_forbidden(virt_to_page(res));
77                         allocated_unsafe_pages++;
78                         res = (void *)get_zeroed_page(gfp_mask);
79                 }
80         if (res) {
81                 swsusp_set_page_forbidden(virt_to_page(res));
82                 swsusp_set_page_free(virt_to_page(res));
83         }
84         return res;
85 }
86
87 unsigned long get_safe_page(gfp_t gfp_mask)
88 {
89         return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
90 }
91
92 static struct page *alloc_image_page(gfp_t gfp_mask)
93 {
94         struct page *page;
95
96         page = alloc_page(gfp_mask);
97         if (page) {
98                 swsusp_set_page_forbidden(page);
99                 swsusp_set_page_free(page);
100         }
101         return page;
102 }
103
/**
 *      free_image_page - release a page obtained from get_image_page()
 *      @addr: kernel virtual address of the page (must be a valid one)
 *      @clear_nosave_free: if nonzero, the page's bit in the free-pages
 *      bitmap is cleared as well
 *
 *      The "forbidden" mark is always cleared before the page is freed.
 */

static inline void free_image_page(void *addr, int clear_nosave_free)
{
        struct page *pg;

        BUG_ON(!virt_addr_valid(addr));

        pg = virt_to_page(addr);
        swsusp_unset_page_forbidden(pg);
        if (clear_nosave_free != 0)
                swsusp_unset_page_free(pg);

        __free_page(pg);
}
123
/* struct linked_page is used to build chains of pages */

/* Payload bytes available in a page once the 'next' pointer is accounted for */
#define LINKED_PAGE_DATA_SIZE   (PAGE_SIZE - sizeof(void *))

/* One page of a chain: a link to the next page followed by the payload area */
struct linked_page {
        struct linked_page *next;       /* next page of the chain (NULL ends it) */
        char data[LINKED_PAGE_DATA_SIZE];       /* storage handed out by chain_alloc() */
} __attribute__((packed));
132
133 static inline void
134 free_list_of_pages(struct linked_page *list, int clear_page_nosave)
135 {
136         while (list) {
137                 struct linked_page *lp = list->next;
138
139                 free_image_page(list, clear_page_nosave);
140                 list = lp;
141         }
142 }
143
/**
 *      struct chain_allocator is used for allocating small objects out of
 *      a linked list of pages called 'the chain'.
 *
 *      The chain grows each time there is no room for a new object in
 *      the current page.  The allocated objects cannot be freed individually.
 *      It is only possible to free them all at once, by freeing the entire
 *      chain.
 *
 *      NOTE: The chain allocator may be inefficient if the allocated objects
 *      are not much smaller than PAGE_SIZE.
 */

struct chain_allocator {
        struct linked_page *chain;      /* the chain; newest page is at the head */
        unsigned int used_space;        /* total size of objects allocated out
                                         * of the current (head) page
                                         */
        gfp_t gfp_mask;         /* mask for allocating pages */
        int safe_needed;        /* if set, only "safe" pages are allocated */
};
165
166 static void
167 chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
168 {
169         ca->chain = NULL;
170         ca->used_space = LINKED_PAGE_DATA_SIZE;
171         ca->gfp_mask = gfp_mask;
172         ca->safe_needed = safe_needed;
173 }
174
175 static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
176 {
177         void *ret;
178
179         if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
180                 struct linked_page *lp;
181
182                 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
183                 if (!lp)
184                         return NULL;
185
186                 lp->next = ca->chain;
187                 ca->chain = lp;
188                 ca->used_space = 0;
189         }
190         ret = ca->chain->data + ca->used_space;
191         ca->used_space += size;
192         return ret;
193 }
194
195 static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
196 {
197         free_list_of_pages(ca->chain, clear_page_nosave);
198         memset(ca, 0, sizeof(struct chain_allocator));
199 }
200
201 /**
202  *      Data types related to memory bitmaps.
203  *
 *      Memory bitmap is a structure consisting of many linked lists of
 *      objects.  The main list's elements are of type struct zone_bitmap
 *      and each of them corresponds to one zone.  For each zone bitmap
 *      object there is a list of objects of type struct bm_block that
 *      represent the blocks of the bitmap in which information is stored.
209  *
210  *      struct memory_bitmap contains a pointer to the main list of zone
211  *      bitmap objects, a struct bm_position used for browsing the bitmap,
212  *      and a pointer to the list of pages used for allocating all of the
213  *      zone bitmap objects and bitmap block objects.
214  *
215  *      NOTE: It has to be possible to lay out the bitmap in memory
216  *      using only allocations of order 0.  Additionally, the bitmap is
217  *      designed to work with arbitrary number of zones (this is over the
218  *      top for now, but let's avoid making unnecessary assumptions ;-).
219  *
220  *      struct zone_bitmap contains a pointer to a list of bitmap block
221  *      objects and a pointer to the bitmap block object that has been
222  *      most recently used for setting bits.  Additionally, it contains the
223  *      pfns that correspond to the start and end of the represented zone.
224  *
225  *      struct bm_block contains a pointer to the memory page in which
226  *      information is stored (in the form of a block of bitmap)
227  *      It also contains the pfns that correspond to the start and end of
228  *      the represented memory area.
229  */
230
231 #define BM_END_OF_MAP   (~0UL)
232
233 #define BM_BITS_PER_BLOCK       (PAGE_SIZE << 3)
234
/* One block of a zone's bitmap: a page of bits covering [start_pfn, end_pfn) */
struct bm_block {
        struct bm_block *next;          /* next element of the list */
        unsigned long start_pfn;        /* pfn represented by the first bit */
        unsigned long end_pfn;  /* pfn represented by the last bit plus 1 */
        unsigned long *data;    /* bitmap representing pages */
};
241
242 static inline unsigned long bm_block_bits(struct bm_block *bb)
243 {
244         return bb->end_pfn - bb->start_pfn;
245 }
246
/* Bitmap of a single zone: the pfn range it spans plus its list of blocks */
struct zone_bitmap {
        struct zone_bitmap *next;       /* next element of the list */
        unsigned long start_pfn;        /* minimal pfn in this zone */
        unsigned long end_pfn;          /* maximal pfn in this zone plus 1 */
        struct bm_block *bm_blocks;     /* list of bitmap blocks */
        struct bm_block *cur_block;     /* recently used bitmap block */
};
254
/* struct bm_position is used for browsing memory bitmaps */

struct bm_position {
        struct zone_bitmap *zone_bm;    /* zone bitmap being browsed */
        struct bm_block *block;         /* block being browsed within that zone */
        int bit;                        /* bit in @block at which the next search starts */
};
262
/* Top-level bitmap object: per-zone bitmaps, their backing pages, a cursor */
struct memory_bitmap {
        struct zone_bitmap *zone_bm_list;       /* list of zone bitmaps */
        struct linked_page *p_list;     /* list of pages used to store zone
                                         * bitmap objects and bitmap block
                                         * objects
                                         */
        struct bm_position cur; /* most recently used bit position */
};
271
272 /* Functions that operate on memory bitmaps */
273
274 static void memory_bm_position_reset(struct memory_bitmap *bm)
275 {
276         struct zone_bitmap *zone_bm;
277
278         zone_bm = bm->zone_bm_list;
279         bm->cur.zone_bm = zone_bm;
280         bm->cur.block = zone_bm->bm_blocks;
281         bm->cur.bit = 0;
282 }
283
284 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
285
286 /**
287  *      create_bm_block_list - create a list of block bitmap objects
288  */
289
290 static inline struct bm_block *
291 create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
292 {
293         struct bm_block *bblist = NULL;
294
295         while (nr_blocks-- > 0) {
296                 struct bm_block *bb;
297
298                 bb = chain_alloc(ca, sizeof(struct bm_block));
299                 if (!bb)
300                         return NULL;
301
302                 bb->next = bblist;
303                 bblist = bb;
304         }
305         return bblist;
306 }
307
308 /**
309  *      create_zone_bm_list - create a list of zone bitmap objects
310  */
311
312 static inline struct zone_bitmap *
313 create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
314 {
315         struct zone_bitmap *zbmlist = NULL;
316
317         while (nr_zones-- > 0) {
318                 struct zone_bitmap *zbm;
319
320                 zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
321                 if (!zbm)
322                         return NULL;
323
324                 zbm->next = zbmlist;
325                 zbmlist = zbm;
326         }
327         return zbmlist;
328 }
329
330 /**
331   *     memory_bm_create - allocate memory for a memory bitmap
332   */
333
334 static int
335 memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
336 {
337         struct chain_allocator ca;
338         struct zone *zone;
339         struct zone_bitmap *zone_bm;
340         struct bm_block *bb;
341         unsigned int nr;
342
343         chain_init(&ca, gfp_mask, safe_needed);
344
345         /* Compute the number of zones */
346         nr = 0;
347         for_each_zone(zone)
348                 if (populated_zone(zone))
349                         nr++;
350
351         /* Allocate the list of zones bitmap objects */
352         zone_bm = create_zone_bm_list(nr, &ca);
353         bm->zone_bm_list = zone_bm;
354         if (!zone_bm) {
355                 chain_free(&ca, PG_UNSAFE_CLEAR);
356                 return -ENOMEM;
357         }
358
359         /* Initialize the zone bitmap objects */
360         for_each_zone(zone) {
361                 unsigned long pfn;
362
363                 if (!populated_zone(zone))
364                         continue;
365
366                 zone_bm->start_pfn = zone->zone_start_pfn;
367                 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
368                 /* Allocate the list of bitmap block objects */
369                 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
370                 bb = create_bm_block_list(nr, &ca);
371                 zone_bm->bm_blocks = bb;
372                 zone_bm->cur_block = bb;
373                 if (!bb)
374                         goto Free;
375
376                 nr = zone->spanned_pages;
377                 pfn = zone->zone_start_pfn;
378                 /* Initialize the bitmap block objects */
379                 while (bb) {
380                         unsigned long *ptr;
381
382                         ptr = get_image_page(gfp_mask, safe_needed);
383                         bb->data = ptr;
384                         if (!ptr)
385                                 goto Free;
386
387                         bb->start_pfn = pfn;
388                         if (nr >= BM_BITS_PER_BLOCK) {
389                                 pfn += BM_BITS_PER_BLOCK;
390                                 nr -= BM_BITS_PER_BLOCK;
391                         } else {
392                                 /* This is executed only once in the loop */
393                                 pfn += nr;
394                         }
395                         bb->end_pfn = pfn;
396                         bb = bb->next;
397                 }
398                 zone_bm = zone_bm->next;
399         }
400         bm->p_list = ca.chain;
401         memory_bm_position_reset(bm);
402         return 0;
403
404  Free:
405         bm->p_list = ca.chain;
406         memory_bm_free(bm, PG_UNSAFE_CLEAR);
407         return -ENOMEM;
408 }
409
410 /**
411   *     memory_bm_free - free memory occupied by the memory bitmap @bm
412   */
413
414 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
415 {
416         struct zone_bitmap *zone_bm;
417
418         /* Free the list of bit blocks for each zone_bitmap object */
419         zone_bm = bm->zone_bm_list;
420         while (zone_bm) {
421                 struct bm_block *bb;
422
423                 bb = zone_bm->bm_blocks;
424                 while (bb) {
425                         if (bb->data)
426                                 free_image_page(bb->data, clear_nosave_free);
427                         bb = bb->next;
428                 }
429                 zone_bm = zone_bm->next;
430         }
431         free_list_of_pages(bm->p_list, clear_nosave_free);
432         bm->zone_bm_list = NULL;
433 }
434
435 /**
436  *      memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
437  *      to given pfn.  The cur_zone_bm member of @bm and the cur_block member
438  *      of @bm->cur_zone_bm are updated.
439  */
440
441 static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
442                                 void **addr, unsigned int *bit_nr)
443 {
444         struct zone_bitmap *zone_bm;
445         struct bm_block *bb;
446
447         /* Check if the pfn is from the current zone */
448         zone_bm = bm->cur.zone_bm;
449         if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
450                 zone_bm = bm->zone_bm_list;
451                 /* We don't assume that the zones are sorted by pfns */
452                 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
453                         zone_bm = zone_bm->next;
454
455                         if (!zone_bm)
456                                 return -EFAULT;
457                 }
458                 bm->cur.zone_bm = zone_bm;
459         }
460         /* Check if the pfn corresponds to the current bitmap block */
461         bb = zone_bm->cur_block;
462         if (pfn < bb->start_pfn)
463                 bb = zone_bm->bm_blocks;
464
465         while (pfn >= bb->end_pfn) {
466                 bb = bb->next;
467
468                 BUG_ON(!bb);
469         }
470         zone_bm->cur_block = bb;
471         pfn -= bb->start_pfn;
472         *bit_nr = pfn;
473         *addr = bb->data;
474         return 0;
475 }
476
/* Set the bit of @pfn in @bm; a pfn not covered by the bitmap is a BUG */
static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
        unsigned int nr;
        void *base;
        int rc = memory_bm_find_bit(bm, pfn, &base, &nr);

        BUG_ON(rc);
        set_bit(nr, base);
}
487
/*
 * Set the bit of @pfn in @bm if the bitmap covers that pfn.  Returns 0 on
 * success or the error reported by memory_bm_find_bit() otherwise.
 */
static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
{
        unsigned int nr;
        void *base;
        int rc = memory_bm_find_bit(bm, pfn, &base, &nr);

        if (rc)
                return rc;

        set_bit(nr, base);
        return rc;
}
499
/* Clear the bit of @pfn in @bm; a pfn not covered by the bitmap is a BUG */
static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
        unsigned int nr;
        void *base;
        int rc = memory_bm_find_bit(bm, pfn, &base, &nr);

        BUG_ON(rc);
        clear_bit(nr, base);
}
510
/*
 * Return the result of test_bit() for the bit of @pfn in @bm; a pfn not
 * covered by the bitmap is a BUG.
 */
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
        unsigned int nr;
        void *base;
        int rc = memory_bm_find_bit(bm, pfn, &base, &nr);

        BUG_ON(rc);
        return test_bit(nr, base);
}
521
522 /**
523  *      memory_bm_next_pfn - find the pfn that corresponds to the next set bit
524  *      in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
525  *      returned.
526  *
527  *      It is required to run memory_bm_position_reset() before the first call to
528  *      this function.
529  */
530
531 static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
532 {
533         struct zone_bitmap *zone_bm;
534         struct bm_block *bb;
535         int bit;
536
537         do {
538                 bb = bm->cur.block;
539                 do {
540                         bit = bm->cur.bit;
541                         bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
542                         if (bit < bm_block_bits(bb))
543                                 goto Return_pfn;
544
545                         bb = bb->next;
546                         bm->cur.block = bb;
547                         bm->cur.bit = 0;
548                 } while (bb);
549                 zone_bm = bm->cur.zone_bm->next;
550                 if (zone_bm) {
551                         bm->cur.zone_bm = zone_bm;
552                         bm->cur.block = zone_bm->bm_blocks;
553                         bm->cur.bit = 0;
554                 }
555         } while (zone_bm);
556         memory_bm_position_reset(bm);
557         return BM_END_OF_MAP;
558
559  Return_pfn:
560         bm->cur.bit = bit + 1;
561         return bb->start_pfn + bit;
562 }
563
/**
 *      This structure represents a range of page frames the contents of which
 *      should not be saved during the suspend.
 */

struct nosave_region {
        struct list_head list;          /* link in the nosave_regions list */
        unsigned long start_pfn;        /* first pfn of the range */
        unsigned long end_pfn;          /* last pfn of the range plus 1 */
};
574
575 static LIST_HEAD(nosave_regions);
576
577 /**
578  *      register_nosave_region - register a range of page frames the contents
579  *      of which should not be saved during the suspend (to be used in the early
580  *      initialization code)
581  */
582
583 void __init
584 __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
585                          int use_kmalloc)
586 {
587         struct nosave_region *region;
588
589         if (start_pfn >= end_pfn)
590                 return;
591
592         if (!list_empty(&nosave_regions)) {
593                 /* Try to extend the previous region (they should be sorted) */
594                 region = list_entry(nosave_regions.prev,
595                                         struct nosave_region, list);
596                 if (region->end_pfn == start_pfn) {
597                         region->end_pfn = end_pfn;
598                         goto Report;
599                 }
600         }
601         if (use_kmalloc) {
602                 /* during init, this shouldn't fail */
603                 region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
604                 BUG_ON(!region);
605         } else
606                 /* This allocation cannot fail */
607                 region = alloc_bootmem_low(sizeof(struct nosave_region));
608         region->start_pfn = start_pfn;
609         region->end_pfn = end_pfn;
610         list_add_tail(&region->list, &nosave_regions);
611  Report:
612         printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n",
613                 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
614 }
615
616 /*
617  * Set bits in this map correspond to the page frames the contents of which
618  * should not be saved during the suspend.
619  */
620 static struct memory_bitmap *forbidden_pages_map;
621
622 /* Set bits in this map correspond to free page frames. */
623 static struct memory_bitmap *free_pages_map;
624
625 /*
626  * Each page frame allocated for creating the image is marked by setting the
627  * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
628  */
629
630 void swsusp_set_page_free(struct page *page)
631 {
632         if (free_pages_map)
633                 memory_bm_set_bit(free_pages_map, page_to_pfn(page));
634 }
635
636 static int swsusp_page_is_free(struct page *page)
637 {
638         return free_pages_map ?
639                 memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
640 }
641
642 void swsusp_unset_page_free(struct page *page)
643 {
644         if (free_pages_map)
645                 memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
646 }
647
648 static void swsusp_set_page_forbidden(struct page *page)
649 {
650         if (forbidden_pages_map)
651                 memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
652 }
653
654 int swsusp_page_is_forbidden(struct page *page)
655 {
656         return forbidden_pages_map ?
657                 memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
658 }
659
660 static void swsusp_unset_page_forbidden(struct page *page)
661 {
662         if (forbidden_pages_map)
663                 memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
664 }
665
666 /**
667  *      mark_nosave_pages - set bits corresponding to the page frames the
668  *      contents of which should not be saved in a given bitmap.
669  */
670
671 static void mark_nosave_pages(struct memory_bitmap *bm)
672 {
673         struct nosave_region *region;
674
675         if (list_empty(&nosave_regions))
676                 return;
677
678         list_for_each_entry(region, &nosave_regions, list) {
679                 unsigned long pfn;
680
681                 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n",
682                                 region->start_pfn << PAGE_SHIFT,
683                                 region->end_pfn << PAGE_SHIFT);
684
685                 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
686                         if (pfn_valid(pfn)) {
687                                 /*
688                                  * It is safe to ignore the result of
689                                  * mem_bm_set_bit_check() here, since we won't
690                                  * touch the PFNs for which the error is
691                                  * returned anyway.
692                                  */
693                                 mem_bm_set_bit_check(bm, pfn);
694                         }
695         }
696 }
697
698 /**
699  *      create_basic_memory_bitmaps - create bitmaps needed for marking page
700  *      frames that should not be saved and free page frames.  The pointers
701  *      forbidden_pages_map and free_pages_map are only modified if everything
702  *      goes well, because we don't want the bits to be used before both bitmaps
703  *      are set up.
704  */
705
706 int create_basic_memory_bitmaps(void)
707 {
708         struct memory_bitmap *bm1, *bm2;
709         int error = 0;
710
711         BUG_ON(forbidden_pages_map || free_pages_map);
712
713         bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
714         if (!bm1)
715                 return -ENOMEM;
716
717         error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
718         if (error)
719                 goto Free_first_object;
720
721         bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
722         if (!bm2)
723                 goto Free_first_bitmap;
724
725         error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
726         if (error)
727                 goto Free_second_object;
728
729         forbidden_pages_map = bm1;
730         free_pages_map = bm2;
731         mark_nosave_pages(forbidden_pages_map);
732
733         pr_debug("PM: Basic memory bitmaps created\n");
734
735         return 0;
736
737  Free_second_object:
738         kfree(bm2);
739  Free_first_bitmap:
740         memory_bm_free(bm1, PG_UNSAFE_CLEAR);
741  Free_first_object:
742         kfree(bm1);
743         return -ENOMEM;
744 }
745
746 /**
747  *      free_basic_memory_bitmaps - free memory bitmaps allocated by
748  *      create_basic_memory_bitmaps().  The auxiliary pointers are necessary
749  *      so that the bitmaps themselves are not referred to while they are being
750  *      freed.
751  */
752
753 void free_basic_memory_bitmaps(void)
754 {
755         struct memory_bitmap *bm1, *bm2;
756
757         BUG_ON(!(forbidden_pages_map && free_pages_map));
758
759         bm1 = forbidden_pages_map;
760         bm2 = free_pages_map;
761         forbidden_pages_map = NULL;
762         free_pages_map = NULL;
763         memory_bm_free(bm1, PG_UNSAFE_CLEAR);
764         kfree(bm1);
765         memory_bm_free(bm2, PG_UNSAFE_CLEAR);
766         kfree(bm2);
767
768         pr_debug("PM: Basic memory bitmaps freed\n");
769 }
770
771 /**
772  *      snapshot_additional_pages - estimate the number of additional pages
773  *      be needed for setting up the suspend image data structures for given
774  *      zone (usually the returned value is greater than the exact number)
775  */
776
777 unsigned int snapshot_additional_pages(struct zone *zone)
778 {
779         unsigned int res;
780
781         res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
782         res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
783         return 2 * res;
784 }
785
786 #ifdef CONFIG_HIGHMEM
787 /**
788  *      count_free_highmem_pages - compute the total number of free highmem
789  *      pages, system-wide.
790  */
791
792 static unsigned int count_free_highmem_pages(void)
793 {
794         struct zone *zone;
795         unsigned int cnt = 0;
796
797         for_each_zone(zone)
798                 if (populated_zone(zone) && is_highmem(zone))
799                         cnt += zone_page_state(zone, NR_FREE_PAGES);
800
801         return cnt;
802 }
803
804 /**
805  *      saveable_highmem_page - Determine whether a highmem page should be
806  *      included in the suspend image.
807  *
808  *      We should save the page if it isn't Nosave or NosaveFree, or Reserved,
809  *      and it isn't a part of a free chunk of pages.
810  */
811
812 static struct page *saveable_highmem_page(unsigned long pfn)
813 {
814         struct page *page;
815
816         if (!pfn_valid(pfn))
817                 return NULL;
818
819         page = pfn_to_page(pfn);
820
821         BUG_ON(!PageHighMem(page));
822
823         if (swsusp_page_is_forbidden(page) ||  swsusp_page_is_free(page) ||
824             PageReserved(page))
825                 return NULL;
826
827         return page;
828 }
829
830 /**
831  *      count_highmem_pages - compute the total number of saveable highmem
832  *      pages.
833  */
834
835 unsigned int count_highmem_pages(void)
836 {
837         struct zone *zone;
838         unsigned int n = 0;
839
840         for_each_zone(zone) {
841                 unsigned long pfn, max_zone_pfn;
842
843                 if (!is_highmem(zone))
844                         continue;
845
846                 mark_free_pages(zone);
847                 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
848                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
849                         if (saveable_highmem_page(pfn))
850                                 n++;
851         }
852         return n;
853 }
854 #else
/* Stub used when CONFIG_HIGHMEM is not set: no pfn is a saveable highmem page */
static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
856 #endif /* CONFIG_HIGHMEM */
857
858 /**
859  *      saveable_page - Determine whether a non-highmem page should be included
860  *      in the suspend image.
861  *
862  *      We should save the page if it isn't Nosave, and is not in the range
863  *      of pages statically defined as 'unsaveable', and it isn't a part of
864  *      a free chunk of pages.
865  */
866
867 static struct page *saveable_page(unsigned long pfn)
868 {
869         struct page *page;
870
871         if (!pfn_valid(pfn))
872                 return NULL;
873
874         page = pfn_to_page(pfn);
875
876         BUG_ON(PageHighMem(page));
877
878         if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
879                 return NULL;
880
881         if (PageReserved(page)
882             && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
883                 return NULL;
884
885         return page;
886 }
887
888 /**
889  *      count_data_pages - compute the total number of saveable non-highmem
890  *      pages.
891  */
892
893 unsigned int count_data_pages(void)
894 {
895         struct zone *zone;
896         unsigned long pfn, max_zone_pfn;
897         unsigned int n = 0;
898
899         for_each_zone(zone) {
900                 if (is_highmem(zone))
901                         continue;
902
903                 mark_free_pages(zone);
904                 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
905                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
906                         if(saveable_page(pfn))
907                                 n++;
908         }
909         return n;
910 }
911
912 /* This is needed, because copy_page and memcpy are not usable for copying
913  * task structs.
914  */
915 static inline void do_copy_page(long *dst, long *src)
916 {
917         int n;
918
919         for (n = PAGE_SIZE / sizeof(long); n; n--)
920                 *dst++ = *src++;
921 }
922
923
/**
 *      safe_copy_page - copy the contents of @s_page to @dst, mapping the
 *              source page into the kernel page tables for the duration of
 *              the copy if kernel_page_present() reports that it is not
 *              mapped (this never is the case if CONFIG_DEBUG_PAGEALLOC is
 *              not set, because kernel_page_present() always returns 'true'
 *              then).
 */
static void safe_copy_page(void *dst, struct page *s_page)
{
        int was_present = kernel_page_present(s_page);

        if (!was_present)
                kernel_map_pages(s_page, 1, 1);

        do_copy_page(dst, page_address(s_page));

        /* Put the mapping back the way it was found */
        if (!was_present)
                kernel_map_pages(s_page, 1, 0);
}
940
941
942 #ifdef CONFIG_HIGHMEM
/* Dispatch the "is this pfn saveable?" test to the highmem or the normal
 * variant, depending on the kind of @zone.
 */
static inline struct page *
page_is_saveable(struct zone *zone, unsigned long pfn)
{
	if (is_highmem(zone))
		return saveable_highmem_page(pfn);

	return saveable_page(pfn);
}
949
950 static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
951 {
952         struct page *s_page, *d_page;
953         void *src, *dst;
954
955         s_page = pfn_to_page(src_pfn);
956         d_page = pfn_to_page(dst_pfn);
957         if (PageHighMem(s_page)) {
958                 src = kmap_atomic(s_page, KM_USER0);
959                 dst = kmap_atomic(d_page, KM_USER1);
960                 do_copy_page(dst, src);
961                 kunmap_atomic(src, KM_USER0);
962                 kunmap_atomic(dst, KM_USER1);
963         } else {
964                 if (PageHighMem(d_page)) {
965                         /* Page pointed to by src may contain some kernel
966                          * data modified by kmap_atomic()
967                          */
968                         safe_copy_page(buffer, s_page);
969                         dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
970                         memcpy(dst, buffer, PAGE_SIZE);
971                         kunmap_atomic(dst, KM_USER0);
972                 } else {
973                         safe_copy_page(page_address(d_page), s_page);
974                 }
975         }
976 }
977 #else
978 #define page_is_saveable(zone, pfn)     saveable_page(pfn)
979
/* Without highmem both page frames are directly addressable */
static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
	struct page *d_page = pfn_to_page(dst_pfn);
	struct page *s_page = pfn_to_page(src_pfn);

	safe_copy_page(page_address(d_page), s_page);
}
985 #endif /* CONFIG_HIGHMEM */
986
987 static void
988 copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
989 {
990         struct zone *zone;
991         unsigned long pfn;
992
993         for_each_zone(zone) {
994                 unsigned long max_zone_pfn;
995
996                 mark_free_pages(zone);
997                 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
998                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
999                         if (page_is_saveable(zone, pfn))
1000                                 memory_bm_set_bit(orig_bm, pfn);
1001         }
1002         memory_bm_position_reset(orig_bm);
1003         memory_bm_position_reset(copy_bm);
1004         for(;;) {
1005                 pfn = memory_bm_next_pfn(orig_bm);
1006                 if (unlikely(pfn == BM_END_OF_MAP))
1007                         break;
1008                 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
1009         }
1010 }
1011
1012 /* Total number of image pages */
1013 static unsigned int nr_copy_pages;
1014 /* Number of pages needed for saving the original pfns of the image pages */
1015 static unsigned int nr_meta_pages;
1016
1017 /**
1018  *      swsusp_free - free pages allocated for the suspend.
1019  *
1020  *      Suspend pages are alocated before the atomic copy is made, so we
1021  *      need to release them after the resume.
1022  */
1023
1024 void swsusp_free(void)
1025 {
1026         struct zone *zone;
1027         unsigned long pfn, max_zone_pfn;
1028
1029         for_each_zone(zone) {
1030                 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1031                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1032                         if (pfn_valid(pfn)) {
1033                                 struct page *page = pfn_to_page(pfn);
1034
1035                                 if (swsusp_page_is_forbidden(page) &&
1036                                     swsusp_page_is_free(page)) {
1037                                         swsusp_unset_page_forbidden(page);
1038                                         swsusp_unset_page_free(page);
1039                                         __free_page(page);
1040                                 }
1041                         }
1042         }
1043         nr_copy_pages = 0;
1044         nr_meta_pages = 0;
1045         restore_pblist = NULL;
1046         buffer = NULL;
1047 }
1048
#ifdef CONFIG_HIGHMEM
/**
 *	count_pages_for_highmem - compute the number of non-highmem pages
 *	that will be necessary for creating copies of highmem pages, i.e.
 *	the part of @nr_highmem that exceeds the number of free highmem
 *	pages (0 if there are enough of them).
 */

static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
{
	unsigned int free_highmem = count_free_highmem_pages();

	return nr_highmem > free_highmem ? nr_highmem - free_highmem : 0;
}
#else
/* No highmem, so no normal pages are ever needed for highmem copies */
static unsigned int
count_pages_for_highmem(unsigned int nr_highmem)
{
	return 0;
}
#endif /* CONFIG_HIGHMEM */
1070
1071 /**
1072  *      enough_free_mem - Make sure we have enough free memory for the
1073  *      snapshot image.
1074  */
1075
1076 static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1077 {
1078         struct zone *zone;
1079         unsigned int free = 0, meta = 0;
1080
1081         for_each_zone(zone) {
1082                 meta += snapshot_additional_pages(zone);
1083                 if (!is_highmem(zone))
1084                         free += zone_page_state(zone, NR_FREE_PAGES);
1085         }
1086
1087         nr_pages += count_pages_for_highmem(nr_highmem);
1088         pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n",
1089                 nr_pages, PAGES_FOR_IO, meta, free);
1090
1091         return free > nr_pages + PAGES_FOR_IO + meta;
1092 }
1093
1094 #ifdef CONFIG_HIGHMEM
1095 /**
1096  *      get_highmem_buffer - if there are some highmem pages in the suspend
1097  *      image, we may need the buffer to copy them and/or load their data.
1098  */
1099
1100 static inline int get_highmem_buffer(int safe_needed)
1101 {
1102         buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
1103         return buffer ? 0 : -ENOMEM;
1104 }
1105
1106 /**
1107  *      alloc_highmem_image_pages - allocate some highmem pages for the image.
1108  *      Try to allocate as many pages as needed, but if the number of free
1109  *      highmem pages is lesser than that, allocate them all.
1110  */
1111
1112 static inline unsigned int
1113 alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1114 {
1115         unsigned int to_alloc = count_free_highmem_pages();
1116
1117         if (to_alloc > nr_highmem)
1118                 to_alloc = nr_highmem;
1119
1120         nr_highmem -= to_alloc;
1121         while (to_alloc-- > 0) {
1122                 struct page *page;
1123
1124                 page = alloc_image_page(__GFP_HIGHMEM);
1125                 memory_bm_set_bit(bm, page_to_pfn(page));
1126         }
1127         return nr_highmem;
1128 }
1129 #else
/* No highmem configured: there is no buffer to allocate, report success */
static inline int get_highmem_buffer(int safe_needed)
{
	return 0;
}

/* No highmem configured: nothing is allocated and nothing is left over */
static inline unsigned int
alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n)
{
	return 0;
}
1134 #endif /* CONFIG_HIGHMEM */
1135
1136 /**
1137  *      swsusp_alloc - allocate memory for the suspend image
1138  *
1139  *      We first try to allocate as many highmem pages as there are
1140  *      saveable highmem pages in the system.  If that fails, we allocate
1141  *      non-highmem pages for the copies of the remaining highmem ones.
1142  *
1143  *      In this approach it is likely that the copies of highmem pages will
1144  *      also be located in the high memory, because of the way in which
1145  *      copy_data_pages() works.
1146  */
1147
1148 static int
1149 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1150                 unsigned int nr_pages, unsigned int nr_highmem)
1151 {
1152         int error;
1153
1154         error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1155         if (error)
1156                 goto Free;
1157
1158         error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1159         if (error)
1160                 goto Free;
1161
1162         if (nr_highmem > 0) {
1163                 error = get_highmem_buffer(PG_ANY);
1164                 if (error)
1165                         goto Free;
1166
1167                 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
1168         }
1169         while (nr_pages-- > 0) {
1170                 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1171
1172                 if (!page)
1173                         goto Free;
1174
1175                 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1176         }
1177         return 0;
1178
1179  Free:
1180         swsusp_free();
1181         return -ENOMEM;
1182 }
1183
1184 /* Memory bitmap used for marking saveable pages (during suspend) or the
1185  * suspend image pages (during resume)
1186  */
1187 static struct memory_bitmap orig_bm;
1188 /* Memory bitmap used on suspend for marking allocated pages that will contain
1189  * the copies of saveable pages.  During resume it is initially used for
1190  * marking the suspend image pages, but then its set bits are duplicated in
1191  * @orig_bm and it is released.  Next, on systems with high memory, it may be
1192  * used for marking "safe" highmem pages, but it has to be reinitialized for
1193  * this purpose.
1194  */
1195 static struct memory_bitmap copy_bm;
1196
1197 asmlinkage int swsusp_save(void)
1198 {
1199         unsigned int nr_pages, nr_highmem;
1200
1201         printk(KERN_INFO "PM: Creating hibernation image: \n");
1202
1203         drain_local_pages(NULL);
1204         nr_pages = count_data_pages();
1205         nr_highmem = count_highmem_pages();
1206         printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
1207
1208         if (!enough_free_mem(nr_pages, nr_highmem)) {
1209                 printk(KERN_ERR "PM: Not enough free memory\n");
1210                 return -ENOMEM;
1211         }
1212
1213         if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1214                 printk(KERN_ERR "PM: Memory allocation failed\n");
1215                 return -ENOMEM;
1216         }
1217
1218         /* During allocating of suspend pagedir, new cold pages may appear.
1219          * Kill them.
1220          */
1221         drain_local_pages(NULL);
1222         copy_data_pages(&copy_bm, &orig_bm);
1223
1224         /*
1225          * End of critical section. From now on, we can write to memory,
1226          * but we should not touch disk. This specially means we must _not_
1227          * touch swap space! Except we must write out our image of course.
1228          */
1229
1230         nr_pages += nr_highmem;
1231         nr_copy_pages = nr_pages;
1232         nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
1233
1234         printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
1235                 nr_pages);
1236
1237         return 0;
1238 }
1239
1240 #ifndef CONFIG_ARCH_HIBERNATION_HEADER
1241 static int init_header_complete(struct swsusp_info *info)
1242 {
1243         memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
1244         info->version_code = LINUX_VERSION_CODE;
1245         return 0;
1246 }
1247
1248 static char *check_image_kernel(struct swsusp_info *info)
1249 {
1250         if (info->version_code != LINUX_VERSION_CODE)
1251                 return "kernel version";
1252         if (strcmp(info->uts.sysname,init_utsname()->sysname))
1253                 return "system type";
1254         if (strcmp(info->uts.release,init_utsname()->release))
1255                 return "kernel release";
1256         if (strcmp(info->uts.version,init_utsname()->version))
1257                 return "version";
1258         if (strcmp(info->uts.machine,init_utsname()->machine))
1259                 return "machine";
1260         return NULL;
1261 }
1262 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
1263
1264 unsigned long snapshot_get_image_size(void)
1265 {
1266         return nr_copy_pages + nr_meta_pages + 1;
1267 }
1268
1269 static int init_header(struct swsusp_info *info)
1270 {
1271         memset(info, 0, sizeof(struct swsusp_info));
1272         info->num_physpages = num_physpages;
1273         info->image_pages = nr_copy_pages;
1274         info->pages = snapshot_get_image_size();
1275         info->size = info->pages;
1276         info->size <<= PAGE_SHIFT;
1277         return init_header_complete(info);
1278 }
1279
1280 /**
1281  *      pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
1282  *      are stored in the array @buf[] (1 page at a time)
1283  */
1284
1285 static inline void
1286 pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1287 {
1288         int j;
1289
1290         for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
1291                 buf[j] = memory_bm_next_pfn(bm);
1292                 if (unlikely(buf[j] == BM_END_OF_MAP))
1293                         break;
1294         }
1295 }
1296
1297 /**
1298  *      snapshot_read_next - used for reading the system memory snapshot.
1299  *
1300  *      On the first call to it @handle should point to a zeroed
1301  *      snapshot_handle structure.  The structure gets updated and a pointer
1302  *      to it should be passed to this function every next time.
1303  *
1304  *      The @count parameter should contain the number of bytes the caller
1305  *      wants to read from the snapshot.  It must not be zero.
1306  *
1307  *      On success the function returns a positive number.  Then, the caller
1308  *      is allowed to read up to the returned number of bytes from the memory
1309  *      location computed by the data_of() macro.  The number returned
1310  *      may be smaller than @count, but this only happens if the read would
1311  *      cross a page boundary otherwise.
1312  *
1313  *      The function returns 0 to indicate the end of data stream condition,
1314  *      and a negative number is returned on error.  In such cases the
1315  *      structure pointed to by @handle is not updated and should not be used
1316  *      any more.
1317  */
1318
1319 int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1320 {
1321         if (handle->cur > nr_meta_pages + nr_copy_pages)
1322                 return 0;
1323
1324         if (!buffer) {
1325                 /* This makes the buffer be freed by swsusp_free() */
1326                 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1327                 if (!buffer)
1328                         return -ENOMEM;
1329         }
1330         if (!handle->offset) {
1331                 int error;
1332
1333                 error = init_header((struct swsusp_info *)buffer);
1334                 if (error)
1335                         return error;
1336                 handle->buffer = buffer;
1337                 memory_bm_position_reset(&orig_bm);
1338                 memory_bm_position_reset(&copy_bm);
1339         }
1340         if (handle->prev < handle->cur) {
1341                 if (handle->cur <= nr_meta_pages) {
1342                         memset(buffer, 0, PAGE_SIZE);
1343                         pack_pfns(buffer, &orig_bm);
1344                 } else {
1345                         struct page *page;
1346
1347                         page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1348                         if (PageHighMem(page)) {
1349                                 /* Highmem pages are copied to the buffer,
1350                                  * because we can't return with a kmapped
1351                                  * highmem page (we may not be called again).
1352                                  */
1353                                 void *kaddr;
1354
1355                                 kaddr = kmap_atomic(page, KM_USER0);
1356                                 memcpy(buffer, kaddr, PAGE_SIZE);
1357                                 kunmap_atomic(kaddr, KM_USER0);
1358                                 handle->buffer = buffer;
1359                         } else {
1360                                 handle->buffer = page_address(page);
1361                         }
1362                 }
1363                 handle->prev = handle->cur;
1364         }
1365         handle->buf_offset = handle->cur_offset;
1366         if (handle->cur_offset + count >= PAGE_SIZE) {
1367                 count = PAGE_SIZE - handle->cur_offset;
1368                 handle->cur_offset = 0;
1369                 handle->cur++;
1370         } else {
1371                 handle->cur_offset += count;
1372         }
1373         handle->offset += count;
1374         return count;
1375 }
1376
1377 /**
1378  *      mark_unsafe_pages - mark the pages that cannot be used for storing
1379  *      the image during resume, because they conflict with the pages that
1380  *      had been used before suspend
1381  */
1382
1383 static int mark_unsafe_pages(struct memory_bitmap *bm)
1384 {
1385         struct zone *zone;
1386         unsigned long pfn, max_zone_pfn;
1387
1388         /* Clear page flags */
1389         for_each_zone(zone) {
1390                 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1391                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1392                         if (pfn_valid(pfn))
1393                                 swsusp_unset_page_free(pfn_to_page(pfn));
1394         }
1395
1396         /* Mark pages that correspond to the "original" pfns as "unsafe" */
1397         memory_bm_position_reset(bm);
1398         do {
1399                 pfn = memory_bm_next_pfn(bm);
1400                 if (likely(pfn != BM_END_OF_MAP)) {
1401                         if (likely(pfn_valid(pfn)))
1402                                 swsusp_set_page_free(pfn_to_page(pfn));
1403                         else
1404                                 return -EFAULT;
1405                 }
1406         } while (pfn != BM_END_OF_MAP);
1407
1408         allocated_unsafe_pages = 0;
1409
1410         return 0;
1411 }
1412
1413 static void
1414 duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
1415 {
1416         unsigned long pfn;
1417
1418         memory_bm_position_reset(src);
1419         pfn = memory_bm_next_pfn(src);
1420         while (pfn != BM_END_OF_MAP) {
1421                 memory_bm_set_bit(dst, pfn);
1422                 pfn = memory_bm_next_pfn(src);
1423         }
1424 }
1425
1426 static int check_header(struct swsusp_info *info)
1427 {
1428         char *reason;
1429
1430         reason = check_image_kernel(info);
1431         if (!reason && info->num_physpages != num_physpages)
1432                 reason = "memory size";
1433         if (reason) {
1434                 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
1435                 return -EPERM;
1436         }
1437         return 0;
1438 }
1439
1440 /**
1441  *      load header - check the image header and copy data from it
1442  */
1443
1444 static int
1445 load_header(struct swsusp_info *info)
1446 {
1447         int error;
1448
1449         restore_pblist = NULL;
1450         error = check_header(info);
1451         if (!error) {
1452                 nr_copy_pages = info->image_pages;
1453                 nr_meta_pages = info->pages - info->image_pages - 1;
1454         }
1455         return error;
1456 }
1457
1458 /**
1459  *      unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
1460  *      the corresponding bit in the memory bitmap @bm
1461  */
1462
1463 static inline void
1464 unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1465 {
1466         int j;
1467
1468         for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
1469                 if (unlikely(buf[j] == BM_END_OF_MAP))
1470                         break;
1471
1472                 memory_bm_set_bit(bm, buf[j]);
1473         }
1474 }
1475
1476 /* List of "safe" pages that may be used to store data loaded from the suspend
1477  * image
1478  */
1479 static struct linked_page *safe_pages_list;
1480
1481 #ifdef CONFIG_HIGHMEM
/* One entry of the list of highmem pages that should be restored
 * atomically during the resume from disk, because the page frames they
 * have occupied before the suspend are in use.
 */
struct highmem_pbe {
	/* page that holds the data now */
	struct page *copy_page;
	/* page that held the data before the suspend */
	struct page *orig_page;
	/* next entry of the list */
	struct highmem_pbe *next;
};
1491
1492 /* List of highmem PBEs needed for restoring the highmem pages that were
1493  * allocated before the suspend and included in the suspend image, but have
1494  * also been allocated by the "resume" kernel, so their contents cannot be
1495  * written directly to their "original" page frames.
1496  */
1497 static struct highmem_pbe *highmem_pblist;
1498
1499 /**
1500  *      count_highmem_image_pages - compute the number of highmem pages in the
1501  *      suspend image.  The bits in the memory bitmap @bm that correspond to the
1502  *      image pages are assumed to be set.
1503  */
1504
1505 static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1506 {
1507         unsigned long pfn;
1508         unsigned int cnt = 0;
1509
1510         memory_bm_position_reset(bm);
1511         pfn = memory_bm_next_pfn(bm);
1512         while (pfn != BM_END_OF_MAP) {
1513                 if (PageHighMem(pfn_to_page(pfn)))
1514                         cnt++;
1515
1516                 pfn = memory_bm_next_pfn(bm);
1517         }
1518         return cnt;
1519 }
1520
1521 /**
1522  *      prepare_highmem_image - try to allocate as many highmem pages as
1523  *      there are highmem image pages (@nr_highmem_p points to the variable
1524  *      containing the number of highmem image pages).  The pages that are
1525  *      "safe" (ie. will not be overwritten when the suspend image is
1526  *      restored) have the corresponding bits set in @bm (it must be
1527  *      unitialized).
1528  *
1529  *      NOTE: This function should not be called if there are no highmem
1530  *      image pages.
1531  */
1532
1533 static unsigned int safe_highmem_pages;
1534
1535 static struct memory_bitmap *safe_highmem_bm;
1536
1537 static int
1538 prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1539 {
1540         unsigned int to_alloc;
1541
1542         if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1543                 return -ENOMEM;
1544
1545         if (get_highmem_buffer(PG_SAFE))
1546                 return -ENOMEM;
1547
1548         to_alloc = count_free_highmem_pages();
1549         if (to_alloc > *nr_highmem_p)
1550                 to_alloc = *nr_highmem_p;
1551         else
1552                 *nr_highmem_p = to_alloc;
1553
1554         safe_highmem_pages = 0;
1555         while (to_alloc-- > 0) {
1556                 struct page *page;
1557
1558                 page = alloc_page(__GFP_HIGHMEM);
1559                 if (!swsusp_page_is_free(page)) {
1560                         /* The page is "safe", set its bit the bitmap */
1561                         memory_bm_set_bit(bm, page_to_pfn(page));
1562                         safe_highmem_pages++;
1563                 }
1564                 /* Mark the page as allocated */
1565                 swsusp_set_page_forbidden(page);
1566                 swsusp_set_page_free(page);
1567         }
1568         memory_bm_position_reset(bm);
1569         safe_highmem_bm = bm;
1570         return 0;
1571 }
1572
1573 /**
1574  *      get_highmem_page_buffer - for given highmem image page find the buffer
1575  *      that suspend_write_next() should set for its caller to write to.
1576  *
1577  *      If the page is to be saved to its "original" page frame or a copy of
1578  *      the page is to be made in the highmem, @buffer is returned.  Otherwise,
1579  *      the copy of the page is to be made in normal memory, so the address of
1580  *      the copy is returned.
1581  *
1582  *      If @buffer is returned, the caller of suspend_write_next() will write
1583  *      the page's contents to @buffer, so they will have to be copied to the
1584  *      right location on the next call to suspend_write_next() and it is done
1585  *      with the help of copy_last_highmem_page().  For this purpose, if
1586  *      @buffer is returned, @last_highmem page is set to the page to which
1587  *      the data will have to be copied from @buffer.
1588  */
1589
1590 static struct page *last_highmem_page;
1591
1592 static void *
1593 get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1594 {
1595         struct highmem_pbe *pbe;
1596         void *kaddr;
1597
1598         if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
1599                 /* We have allocated the "original" page frame and we can
1600                  * use it directly to store the loaded page.
1601                  */
1602                 last_highmem_page = page;
1603                 return buffer;
1604         }
1605         /* The "original" page frame has not been allocated and we have to
1606          * use a "safe" page frame to store the loaded page.
1607          */
1608         pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1609         if (!pbe) {
1610                 swsusp_free();
1611                 return NULL;
1612         }
1613         pbe->orig_page = page;
1614         if (safe_highmem_pages > 0) {
1615                 struct page *tmp;
1616
1617                 /* Copy of the page will be stored in high memory */
1618                 kaddr = buffer;
1619                 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1620                 safe_highmem_pages--;
1621                 last_highmem_page = tmp;
1622                 pbe->copy_page = tmp;
1623         } else {
1624                 /* Copy of the page will be stored in normal memory */
1625                 kaddr = safe_pages_list;
1626                 safe_pages_list = safe_pages_list->next;
1627                 pbe->copy_page = virt_to_page(kaddr);
1628         }
1629         pbe->next = highmem_pblist;
1630         highmem_pblist = pbe;
1631         return kaddr;
1632 }
1633
1634 /**
1635  *      copy_last_highmem_page - copy the contents of a highmem image from
1636  *      @buffer, where the caller of snapshot_write_next() has place them,
1637  *      to the right location represented by @last_highmem_page .
1638  */
1639
1640 static void copy_last_highmem_page(void)
1641 {
1642         if (last_highmem_page) {
1643                 void *dst;
1644
1645                 dst = kmap_atomic(last_highmem_page, KM_USER0);
1646                 memcpy(dst, buffer, PAGE_SIZE);
1647                 kunmap_atomic(dst, KM_USER0);
1648                 last_highmem_page = NULL;
1649         }
1650 }
1651
1652 static inline int last_highmem_page_copied(void)
1653 {
1654         return !last_highmem_page;
1655 }
1656
1657 static inline void free_highmem_data(void)
1658 {
1659         if (safe_highmem_bm)
1660                 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1661
1662         if (buffer)
1663                 free_image_page(buffer, PG_UNSAFE_CLEAR);
1664 }
1665 #else
/* Variants used when CONFIG_HIGHMEM is not set: the image contains no
 * highmem pages, so there is nothing to count, prepare, copy or free.
 */
static inline int get_safe_write_buffer(void)
{
	return 0;
}

static unsigned int
count_highmem_image_pages(struct memory_bitmap *bm)
{
	return 0;
}

static inline int
prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
{
	return 0;
}

static inline void *
get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
{
	return NULL;
}

static inline void copy_last_highmem_page(void)
{
}

static inline int last_highmem_page_copied(void)
{
	return 1;
}

static inline void free_highmem_data(void)
{
}
1686 #endif /* CONFIG_HIGHMEM */
1687
1688 /**
1689  *      prepare_image - use the memory bitmap @bm to mark the pages that will
1690  *      be overwritten in the process of restoring the system memory state
1691  *      from the suspend image ("unsafe" pages) and allocate memory for the
1692  *      image.
1693  *
1694  *      The idea is to allocate a new memory bitmap first and then allocate
1695  *      as many pages as needed for the image data, but not to assign these
1696  *      pages to specific tasks initially.  Instead, we just mark them as
1697  *      allocated and create a lists of "safe" pages that will be used
1698  *      later.  On systems with high memory a list of "safe" highmem pages is
1699  *      also created.
1700  */
1701
1702 #define PBES_PER_LINKED_PAGE    (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1703
1704 static int
1705 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1706 {
1707         unsigned int nr_pages, nr_highmem;
1708         struct linked_page *sp_list, *lp;
1709         int error;
1710
1711         /* If there is no highmem, the buffer will not be necessary */
1712         free_image_page(buffer, PG_UNSAFE_CLEAR);
1713         buffer = NULL;
1714
1715         nr_highmem = count_highmem_image_pages(bm);
1716         error = mark_unsafe_pages(bm);
1717         if (error)
1718                 goto Free;
1719
1720         error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
1721         if (error)
1722                 goto Free;
1723
1724         duplicate_memory_bitmap(new_bm, bm);
1725         memory_bm_free(bm, PG_UNSAFE_KEEP);
1726         if (nr_highmem > 0) {
1727                 error = prepare_highmem_image(bm, &nr_highmem);
1728                 if (error)
1729                         goto Free;
1730         }
1731         /* Reserve some safe pages for potential later use.
1732          *
1733          * NOTE: This way we make sure there will be enough safe pages for the
1734          * chain_alloc() in get_buffer().  It is a bit wasteful, but
1735          * nr_copy_pages cannot be greater than 50% of the memory anyway.
1736          */
1737         sp_list = NULL;
1738         /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
1739         nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1740         nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1741         while (nr_pages > 0) {
1742                 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1743                 if (!lp) {
1744                         error = -ENOMEM;
1745                         goto Free;
1746                 }
1747                 lp->next = sp_list;
1748                 sp_list = lp;
1749                 nr_pages--;
1750         }
1751         /* Preallocate memory for the image */
1752         safe_pages_list = NULL;
1753         nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1754         while (nr_pages > 0) {
1755                 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1756                 if (!lp) {
1757                         error = -ENOMEM;
1758                         goto Free;
1759                 }
1760                 if (!swsusp_page_is_free(virt_to_page(lp))) {
1761                         /* The page is "safe", add it to the list */
1762                         lp->next = safe_pages_list;
1763                         safe_pages_list = lp;
1764                 }
1765                 /* Mark the page as allocated */
1766                 swsusp_set_page_forbidden(virt_to_page(lp));
1767                 swsusp_set_page_free(virt_to_page(lp));
1768                 nr_pages--;
1769         }
1770         /* Free the reserved safe pages so that chain_alloc() can use them */
1771         while (sp_list) {
1772                 lp = sp_list->next;
1773                 free_image_page(sp_list, PG_UNSAFE_CLEAR);
1774                 sp_list = lp;
1775         }
1776         return 0;
1777
1778  Free:
1779         swsusp_free();
1780         return error;
1781 }
1782
1783 /**
1784  *      get_buffer - compute the address that snapshot_write_next() should
1785  *      set for its caller to write to.
1786  */
1787
1788 static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1789 {
1790         struct pbe *pbe;
1791         struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1792
1793         if (PageHighMem(page))
1794                 return get_highmem_page_buffer(page, ca);
1795
1796         if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
1797                 /* We have allocated the "original" page frame and we can
1798                  * use it directly to store the loaded page.
1799                  */
1800                 return page_address(page);
1801
1802         /* The "original" page frame has not been allocated and we have to
1803          * use a "safe" page frame to store the loaded page.
1804          */
1805         pbe = chain_alloc(ca, sizeof(struct pbe));
1806         if (!pbe) {
1807                 swsusp_free();
1808                 return NULL;
1809         }
1810         pbe->orig_address = page_address(page);
1811         pbe->address = safe_pages_list;
1812         safe_pages_list = safe_pages_list->next;
1813         pbe->next = restore_pblist;
1814         restore_pblist = pbe;
1815         return pbe->address;
1816 }
1817
1818 /**
1819  *      snapshot_write_next - used for writing the system memory snapshot.
1820  *
1821  *      On the first call to it @handle should point to a zeroed
1822  *      snapshot_handle structure.  The structure gets updated and a pointer
1823  *      to it should be passed to this function every next time.
1824  *
1825  *      The @count parameter should contain the number of bytes the caller
1826  *      wants to write to the image.  It must not be zero.
1827  *
1828  *      On success the function returns a positive number.  Then, the caller
1829  *      is allowed to write up to the returned number of bytes to the memory
1830  *      location computed by the data_of() macro.  The number returned
1831  *      may be smaller than @count, but this only happens if the write would
1832  *      cross a page boundary otherwise.
1833  *
1834  *      The function returns 0 to indicate the "end of file" condition,
1835  *      and a negative number is returned on error.  In such cases the
1836  *      structure pointed to by @handle is not updated and should not be used
1837  *      any more.
1838  */
1839
1840 int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1841 {
1842         static struct chain_allocator ca;
1843         int error = 0;
1844
1845         /* Check if we have already loaded the entire image */
1846         if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1847                 return 0;
1848
1849         if (handle->offset == 0) {
1850                 if (!buffer)
1851                         /* This makes the buffer be freed by swsusp_free() */
1852                         buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1853
1854                 if (!buffer)
1855                         return -ENOMEM;
1856
1857                 handle->buffer = buffer;
1858         }
1859         handle->sync_read = 1;
1860         if (handle->prev < handle->cur) {
1861                 if (handle->prev == 0) {
1862                         error = load_header(buffer);
1863                         if (error)
1864                                 return error;
1865
1866                         error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
1867                         if (error)
1868                                 return error;
1869
1870                 } else if (handle->prev <= nr_meta_pages) {
1871                         unpack_orig_pfns(buffer, &copy_bm);
1872                         if (handle->prev == nr_meta_pages) {
1873                                 error = prepare_image(&orig_bm, &copy_bm);
1874                                 if (error)
1875                                         return error;
1876
1877                                 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
1878                                 memory_bm_position_reset(&orig_bm);
1879                                 restore_pblist = NULL;
1880                                 handle->buffer = get_buffer(&orig_bm, &ca);
1881                                 handle->sync_read = 0;
1882                                 if (!handle->buffer)
1883                                         return -ENOMEM;
1884                         }
1885                 } else {
1886                         copy_last_highmem_page();
1887                         handle->buffer = get_buffer(&orig_bm, &ca);
1888                         if (handle->buffer != buffer)
1889                                 handle->sync_read = 0;
1890                 }
1891                 handle->prev = handle->cur;
1892         }
1893         handle->buf_offset = handle->cur_offset;
1894         if (handle->cur_offset + count >= PAGE_SIZE) {
1895                 count = PAGE_SIZE - handle->cur_offset;
1896                 handle->cur_offset = 0;
1897                 handle->cur++;
1898         } else {
1899                 handle->cur_offset += count;
1900         }
1901         handle->offset += count;
1902         return count;
1903 }
1904
1905 /**
1906  *      snapshot_write_finalize - must be called after the last call to
1907  *      snapshot_write_next() in case the last page in the image happens
1908  *      to be a highmem page and its contents should be stored in the
1909  *      highmem.  Additionally, it releases the memory that will not be
1910  *      used any more.
1911  */
1912
1913 void snapshot_write_finalize(struct snapshot_handle *handle)
1914 {
1915         copy_last_highmem_page();
1916         /* Free only if we have loaded the image entirely */
1917         if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1918                 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1919                 free_highmem_data();
1920         }
1921 }
1922
1923 int snapshot_image_loaded(struct snapshot_handle *handle)
1924 {
1925         return !(!nr_copy_pages || !last_highmem_page_copied() ||
1926                         handle->cur <= nr_meta_pages + nr_copy_pages);
1927 }
1928
1929 #ifdef CONFIG_HIGHMEM
1930 /* Assumes that @buf is ready and points to a "safe" page */
1931 static inline void
1932 swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1933 {
1934         void *kaddr1, *kaddr2;
1935
1936         kaddr1 = kmap_atomic(p1, KM_USER0);
1937         kaddr2 = kmap_atomic(p2, KM_USER1);
1938         memcpy(buf, kaddr1, PAGE_SIZE);
1939         memcpy(kaddr1, kaddr2, PAGE_SIZE);
1940         memcpy(kaddr2, buf, PAGE_SIZE);
1941         kunmap_atomic(kaddr1, KM_USER0);
1942         kunmap_atomic(kaddr2, KM_USER1);
1943 }
1944
1945 /**
1946  *      restore_highmem - for each highmem page that was allocated before
1947  *      the suspend and included in the suspend image, and also has been
1948  *      allocated by the "resume" kernel swap its current (ie. "before
1949  *      resume") contents with the previous (ie. "before suspend") one.
1950  *
1951  *      If the resume eventually fails, we can call this function once
1952  *      again and restore the "before resume" highmem state.
1953  */
1954
1955 int restore_highmem(void)
1956 {
1957         struct highmem_pbe *pbe = highmem_pblist;
1958         void *buf;
1959
1960         if (!pbe)
1961                 return 0;
1962
1963         buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1964         if (!buf)
1965                 return -ENOMEM;
1966
1967         while (pbe) {
1968                 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1969                 pbe = pbe->next;
1970         }
1971         free_image_page(buf, PG_UNSAFE_CLEAR);
1972         return 0;
1973 }
1974 #endif /* CONFIG_HIGHMEM */