[PATCH] swsusp: introduce the swap map structure
1 /*
2  * linux/kernel/power/swsusp.c
3  *
4  * This file provides code to write suspend image to swap and read it back.
5  *
6  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7  * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8  *
9  * This file is released under the GPLv2.
10  *
11  * I'd like to thank the following people for their work:
12  *
13  * Pavel Machek <pavel@ucw.cz>:
14  * Modifications, defectiveness pointing, being with me at the very beginning,
15  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16  *
17  * Steve Doddi <dirk@loth.demon.co.uk>:
18  * Support the possibility of hardware state restoring.
19  *
20  * Raph <grey.havens@earthling.net>:
21  * Support for preserving states of network devices and virtual console
22  * (including X and svgatextmode)
23  *
24  * Kurt Garloff <garloff@suse.de>:
25  * Straightened the critical function in order to prevent compilers from
26  * playing tricks with local variables.
27  *
28  * Andreas Mohr <a.mohr@mailto.de>
29  *
30  * Alex Badea <vampire@go.ro>:
31  * Fixed runaway init
32  *
33  * Rafael J. Wysocki <rjw@sisk.pl>
34  * Added the swap map data structure and reworked the handling of swap
35  *
36  * More state savers are welcome. Especially for the scsi layer...
37  *
38  * For TODOs, FIXMEs also look in Documentation/power/swsusp.txt
39  */
40
41 #include <linux/module.h>
42 #include <linux/mm.h>
43 #include <linux/suspend.h>
44 #include <linux/smp_lock.h>
45 #include <linux/file.h>
46 #include <linux/utsname.h>
47 #include <linux/version.h>
48 #include <linux/delay.h>
49 #include <linux/bitops.h>
50 #include <linux/spinlock.h>
51 #include <linux/genhd.h>
52 #include <linux/kernel.h>
53 #include <linux/major.h>
54 #include <linux/swap.h>
55 #include <linux/pm.h>
56 #include <linux/device.h>
57 #include <linux/buffer_head.h>
58 #include <linux/swapops.h>
59 #include <linux/bootmem.h>
60 #include <linux/syscalls.h>
61 #include <linux/highmem.h>
62 #include <linux/bio.h>
63
64 #include <asm/uaccess.h>
65 #include <asm/mmu_context.h>
66 #include <asm/pgtable.h>
67 #include <asm/tlbflush.h>
68 #include <asm/io.h>
69
70 #include "power.h"
71
72 #ifdef CONFIG_HIGHMEM
73 int save_highmem(void);
74 int restore_highmem(void);
75 #else
76 static int save_highmem(void) { return 0; }
77 static int restore_highmem(void) { return 0; }
78 #endif
79
80 extern char resume_file[];
81
82 #define SWSUSP_SIG      "S1SUSPEND"
83
84 static struct swsusp_header {
85         char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
86         swp_entry_t swsusp_info;
87         char    orig_sig[10];
88         char    sig[10];
89 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
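
/*
 * Layout note: the structure above is sized to occupy exactly one page, with
 * sig[] covering the last 10 bytes of the swap device's first page - the
 * location of the "SWAP-SPACE"/"SWAPSPACE2" signature written by mkswap.
 * As a worked example (assuming 4 KiB pages and an 8-byte swp_entry_t):
 *
 *      reserved[4068] + swsusp_info(8) + orig_sig[10] + sig[10] = 4096
 *
 * mark_swapfiles() below relies on this layout: it saves the original
 * signature in orig_sig[] and replaces sig[] with SWSUSP_SIG, and check_sig()
 * restores the original signature on resume.
 */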
90
91 static struct swsusp_info swsusp_info;
92
93 /*
94  * Saving part...
95  */
96
97 /* We record in swapfile_used which swap devices are used for suspension */
98 #define SWAPFILE_UNUSED    0
99 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
100 #define SWAPFILE_IGNORED   2    /* Other swap devices, ignored for suspension */
101
102 static unsigned short swapfile_used[MAX_SWAPFILES];
103 static unsigned short root_swap;
104
105 static int mark_swapfiles(swp_entry_t prev)
106 {
107         int error;
108
109         rw_swap_page_sync(READ,
110                           swp_entry(root_swap, 0),
111                           virt_to_page((unsigned long)&swsusp_header));
112         if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
113             !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
114                 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
115                 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
116                 swsusp_header.swsusp_info = prev;
117                 error = rw_swap_page_sync(WRITE,
118                                           swp_entry(root_swap, 0),
119                                           virt_to_page((unsigned long)
120                                                        &swsusp_header));
121         } else {
122                 pr_debug("swsusp: Partition is not swap space.\n");
123                 error = -ENODEV;
124         }
125         return error;
126 }
127
128 /*
129  * Check whether the swap device is the specified resume
130  * device, irrespective of whether they are specified by
131  * identical names.
132  *
133  * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
134  * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
135  * and they'll be considered the same device.  This is *necessary* for
136  * devfs, since the resume code can only recognize the form /dev/hda4,
137  * but the suspend code would see the long name.)
138  */
139 static int is_resume_device(const struct swap_info_struct *swap_info)
140 {
141         struct file *file = swap_info->swap_file;
142         struct inode *inode = file->f_dentry->d_inode;
143
144         return S_ISBLK(inode->i_mode) &&
145                 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
146 }
147
148 static int swsusp_swap_check(void) /* This is called before saving the image */
149 {
150         int i, len;
151
152         len=strlen(resume_file);
153         root_swap = 0xFFFF;
154
155         spin_lock(&swap_lock);
156         for (i=0; i<MAX_SWAPFILES; i++) {
157                 if (!(swap_info[i].flags & SWP_WRITEOK)) {
158                         swapfile_used[i]=SWAPFILE_UNUSED;
159                 } else {
160                         if (!len) {
161                                 printk(KERN_WARNING "resume= option should be used to set the suspend device\n");
162                                 if (root_swap == 0xFFFF) {
163                                         swapfile_used[i] = SWAPFILE_SUSPEND;
164                                         root_swap = i;
165                                 } else
166                                         swapfile_used[i] = SWAPFILE_IGNORED;
167                         } else {
168                                 /* we ignore all swap devices that are not the resume_file */
169                                 if (is_resume_device(&swap_info[i])) {
170                                         swapfile_used[i] = SWAPFILE_SUSPEND;
171                                         root_swap = i;
172                                 } else {
173                                         swapfile_used[i] = SWAPFILE_IGNORED;
174                                 }
175                         }
176                 }
177         }
178         spin_unlock(&swap_lock);
179         return (root_swap != 0xffff) ? 0 : -ENODEV;
180 }
181
182 /**
183  *      lock_swapdevices - make the ignored swap devices unusable
184  *
185  *      Called after saving the image, so any modification will be lost after
186  *      resume - which is what we want.  A new call unlocks the devices again.
187  */
188 static void lock_swapdevices(void)
189 {
190         int i;
191
192         spin_lock(&swap_lock);
193         for (i = 0; i< MAX_SWAPFILES; i++)
194                 if (swapfile_used[i] == SWAPFILE_IGNORED) {
195                         swap_info[i].flags ^= SWP_WRITEOK;
196                 }
197         spin_unlock(&swap_lock);
198 }
199
200 /**
201  *      write_page - Write one page to a fresh swap location.
202  *      @addr:  Address we're writing.
203  *      @loc:   Place to store the entry we used.
204  *
205  *      Allocate a new swap entry and 'sync' it. Note we discard -EIO
206  *      errors. That is an artifact left over from swsusp. It did not
207  *      check the return of rw_swap_page_sync() at all, since most pages
208  *      written back to swap would return -EIO.
209  *      This is a partial improvement, since we will at least return other
210  *      errors, though we need to eventually fix the damn code.
211  */
212 static int write_page(unsigned long addr, swp_entry_t *loc)
213 {
214         swp_entry_t entry;
215         int error = 0;
216
217         entry = get_swap_page();
218         if (swp_offset(entry) &&
219             swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
220                 error = rw_swap_page_sync(WRITE, entry,
221                                           virt_to_page(addr));
222                 if (error == -EIO)
223                         error = 0;
224                 if (!error)
225                         *loc = entry;
226         } else
227                 error = -ENOSPC;
228         return error;
229 }
230
231 /**
232  *      Swap map-handling functions
233  *
234  *      The swap map is a data structure used for keeping track of each page
235  *      written to the swap.  It consists of many swap_map_page structures
236  *      each of which contains an array of MAP_PAGE_SIZE swap entries.
237  *      These structures are linked together with the help of either the
238  *      .next (in memory) or the .next_swap (in swap) member.
239  *
240  *      The swap map is created during suspend.  At that time we need to keep
241  *      it in memory, because we have to free all of the allocated swap
242  *      entries if an error occurs.  The memory needed is preallocated
243  *      so that we know in advance if there's enough of it.
244  *
245  *      The first swap_map_page structure is filled with the swap entries that
246  *      correspond to the first MAP_PAGE_SIZE data pages written to swap and
247  *      so on.  After all of the data pages have been written, the order
248  *      of the swap_map_page structures in the map is reversed so that they
249  *      can be read from swap in the original order.  This causes the data
250  *      pages to be loaded in exactly the same order in which they have been
251  *      saved.
252  *
253  *      During resume we only need to use one swap_map_page structure
254  *      at a time, which means that we only need to use two memory pages for
255  *      reading the image - one for reading the swap_map_page structures
256  *      and the second for reading the data pages from swap.
257  */
258
259 #define MAP_PAGE_SIZE   ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
260                         / sizeof(swp_entry_t))
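
/*
 * A quick size check (illustrative numbers only): with 4 KiB pages and an
 * 8-byte swp_entry_t and pointer, MAP_PAGE_SIZE works out to
 *
 *      (4096 - 8 - 8) / 8 = 510
 *
 * entries, so a single swap_map_page describes 510 data pages written to
 * swap, and the remaining 16 bytes hold the links to the next map page in
 * memory (.next) and in swap (.next_swap).
 */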
261
262 struct swap_map_page {
263         swp_entry_t             entries[MAP_PAGE_SIZE];
264         swp_entry_t             next_swap;
265         struct swap_map_page    *next;
266 };
267
268 static inline void free_swap_map(struct swap_map_page *swap_map)
269 {
270         struct swap_map_page *swp;
271
272         while (swap_map) {
273                 swp = swap_map->next;
274                 free_page((unsigned long)swap_map);
275                 swap_map = swp;
276         }
277 }
278
279 static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
280 {
281         struct swap_map_page *swap_map, *swp;
282         unsigned n = 0;
283
284         if (!nr_pages)
285                 return NULL;
286
287         pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
288         swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
            if (!swap_map)
                    return NULL;
289         swp = swap_map;
290         for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
291                 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
292                 swp = swp->next;
293                 if (!swp) {
294                         free_swap_map(swap_map);
295                         return NULL;
296                 }
297         }
298         return swap_map;
299 }
300
301 /**
302  *      reverse_swap_map - reverse the order of pages in the swap map
303  *      @swap_map:      the swap map to reverse
304  */
305
306 static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
307 {
308         struct swap_map_page *prev, *next;
309
310         prev = NULL;
311         while (swap_map) {
312                 next = swap_map->next;
313                 swap_map->next = prev;
314                 prev = swap_map;
315                 swap_map = next;
316         }
317         return prev;
318 }
319
320 /**
321  *      free_swap_map_entries - free the swap entries allocated to store
322  *      the swap map @swap_map (this is only called in case of an error)
323  */
324 static inline void free_swap_map_entries(struct swap_map_page *swap_map)
325 {
326         while (swap_map) {
327                 if (swap_map->next_swap.val)
328                         swap_free(swap_map->next_swap);
329                 swap_map = swap_map->next;
330         }
331 }
332
333 /**
334  *      save_swap_map - save the swap map used for tracking the data pages
335  *      stored in swap
336  */
337
338 static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
339 {
340         swp_entry_t entry = (swp_entry_t){0};
341         int error;
342
343         while (swap_map) {
344                 swap_map->next_swap = entry;
345                 if ((error = write_page((unsigned long)swap_map, &entry)))
346                         return error;
347                 swap_map = swap_map->next;
348         }
349         *start = entry;
350         return 0;
351 }
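
/*
 * An illustration of the chain this builds (assuming three map pages created
 * in the order map0, map1, map2 and then reversed by reverse_swap_map()
 * before save_swap_map() runs): map2 is written first with .next_swap = 0,
 * map1 is written next with .next_swap pointing at map2, and map0 is written
 * last with .next_swap pointing at map1, so *start ends up pointing at map0.
 * The resume code can then start at *start and follow .next_swap to read the
 * map pages - and hence the data pages - in their original order.
 */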
352
353 /**
354  *      free_image_entries - free the swap entries allocated to store
355  *      the image data pages (this is only called in case of an error)
356  */
357
358 static inline void free_image_entries(struct swap_map_page *swp)
359 {
360         unsigned k;
361
362         while (swp) {
363                 for (k = 0; k < MAP_PAGE_SIZE; k++)
364                         if (swp->entries[k].val)
365                                 swap_free(swp->entries[k]);
366                 swp = swp->next;
367         }
368 }
369
370 /**
371  *      The swap_map_handle structure is used for handling the swap map in
372  *      a file-like way
373  */
374
375 struct swap_map_handle {
376         struct swap_map_page *cur;
377         unsigned int k;
378 };
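
/*
 * Intended use of the handle during suspend (this mirrors what
 * write_suspend_image() does below): set it up with init_swap_map_handle()
 * and then call swap_map_write_page() once per page to be saved.  Each call
 * allocates a fresh swap entry, writes the page there, records the entry in
 * the current swap_map_page and, after MAP_PAGE_SIZE entries, advances to
 * the next map page automatically.
 */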
379
380 static inline void init_swap_map_handle(struct swap_map_handle *handle,
381                                         struct swap_map_page *map)
382 {
383         handle->cur = map;
384         handle->k = 0;
385 }
386
387 static inline int swap_map_write_page(struct swap_map_handle *handle,
388                                       unsigned long addr)
389 {
390         int error;
391
392         error = write_page(addr, handle->cur->entries + handle->k);
393         if (error)
394                 return error;
395         if (++handle->k >= MAP_PAGE_SIZE) {
396                 handle->cur = handle->cur->next;
397                 handle->k = 0;
398         }
399         return 0;
400 }
401
402 /**
403  *      save_image_data - save the data pages pointed to by the PBEs
404  *      from the list @pblist using the swap map handle @handle
405  *      (assume there are @nr_pages data pages to save)
406  */
407
408 static int save_image_data(struct pbe *pblist,
409                            struct swap_map_handle *handle,
410                            unsigned int nr_pages)
411 {
412         unsigned int m;
413         struct pbe *p;
414         int error = 0;
415
416         printk("Saving image data pages (%u pages) ...     ", nr_pages);
417         m = nr_pages / 100;
418         if (!m)
419                 m = 1;
420         nr_pages = 0;
421         for_each_pbe (p, pblist) {
422                 error = swap_map_write_page(handle, p->address);
423                 if (error)
424                         break;
425                 if (!(nr_pages % m))
426                         printk("\b\b\b\b%3d%%", nr_pages / m);
427                 nr_pages++;
428         }
429         if (!error)
430                 printk("\b\b\b\bdone\n");
431         return error;
432 }
433
434 static void dump_info(void)
435 {
436         pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
437         pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
438         pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
439         pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
440         pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
441         pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
442         pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
443         pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
444         pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
445         pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
446         pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
447 }
448
449 static void init_header(unsigned int nr_pages)
450 {
451         memset(&swsusp_info, 0, sizeof(swsusp_info));
452         swsusp_info.version_code = LINUX_VERSION_CODE;
453         swsusp_info.num_physpages = num_physpages;
454         memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
455
456         swsusp_info.cpus = num_online_cpus();
457         swsusp_info.image_pages = nr_pages;
458         swsusp_info.pages = nr_pages +
459                 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
460 }
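
/*
 * A worked example of the .pages computation above (illustrative numbers
 * only): on a 64-bit machine with 4 KiB pages, an image of
 * nr_pages = 100000 data pages needs 100000 * 8 bytes of metadata (one
 * original address per page), i.e. (800000 + 4095) >> 12 = 196 extra pages,
 * so swsusp_info.pages = 100196.  This is the total number of pages that
 * the swap map allocated in write_suspend_image() has to describe.
 */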
461
462 static int close_swap(void)
463 {
464         swp_entry_t entry;
465         int error;
466
467         dump_info();
468         error = write_page((unsigned long)&swsusp_info, &entry);
469         if (!error) {
470                 printk( "S" );
471                 error = mark_swapfiles(entry);
472                 printk( "|\n" );
473         }
474         return error;
475 }
476
477 /**
478  *      pack_orig_addresses - store the .orig_address fields of the PBEs from
479  *      the list starting at @pbe in the array @buf[] (one page)
480  */
481
482 static inline struct pbe *pack_orig_addresses(unsigned long *buf,
483                                               struct pbe *pbe)
484 {
485         int j;
486
487         for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
488                 buf[j] = pbe->orig_address;
489                 pbe = pbe->next;
490         }
491         if (!pbe)
492                 for (; j < PAGE_SIZE / sizeof(long); j++)
493                         buf[j] = 0;
494         return pbe;
495 }
496
497 /**
498  *      save_image_metadata - save the .orig_address fields of the PBEs
499  *      from the list @pblist using the swap map handle @handle
500  */
501
502 static int save_image_metadata(struct pbe *pblist,
503                                struct swap_map_handle *handle)
504 {
505         unsigned long *buf;
506         unsigned int n = 0;
507         struct pbe *p;
508         int error = 0;
509
510         printk("Saving image metadata ... ");
511         buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
512         if (!buf)
513                 return -ENOMEM;
514         p = pblist;
515         while (p) {
516                 p = pack_orig_addresses(buf, p);
517                 error = swap_map_write_page(handle, (unsigned long)buf);
518                 if (error)
519                         break;
520                 n++;
521         }
522         free_page((unsigned long)buf);
523         if (!error)
524                 printk("done (%u pages saved)\n", n);
525         return error;
526 }
527
528 /**
529  *      enough_swap - Make sure we have enough swap to save the image.
530  *
531  *      Returns TRUE or FALSE after checking the total amount of swap
532  *      space available.
533  *
534  *      FIXME: si_swapinfo(&i) returns information about all swap devices.
535  *      We should only consider resume_device.
536  */
537
538 static int enough_swap(unsigned int nr_pages)
539 {
540         struct sysinfo i;
541
542         si_swapinfo(&i);
543         pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
544         return i.freeswap > (nr_pages + PAGES_FOR_IO +
545                 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
546 }
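
/*
 * A rough illustration of the check above (example figures only, assuming
 * 4 KiB pages and a 12-byte struct pbe, i.e. PBES_PER_PAGE = 341): saving a
 * 100000-page image requires at least
 *
 *      100000 + PAGES_FOR_IO + (100000 + 340) / 341 = 100294 + PAGES_FOR_IO
 *
 * free swap pages - the image itself, some slack for I/O, and an allowance
 * for the image metadata.
 */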
547
548 /**
549  *      write_suspend_image - Write entire image and metadata.
550  */
551 static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages)
552 {
553         struct swap_map_page *swap_map;
554         struct swap_map_handle handle;
555         int error;
556
557         if (!enough_swap(nr_pages)) {
558                 printk(KERN_ERR "swsusp: Not enough free swap\n");
559                 return -ENOSPC;
560         }
561
562         init_header(nr_pages);
563         swap_map = alloc_swap_map(swsusp_info.pages);
564         if (!swap_map)
565                 return -ENOMEM;
566         init_swap_map_handle(&handle, swap_map);
567
568         error = save_image_metadata(pblist, &handle);
569         if (!error)
570                 error = save_image_data(pblist, &handle, nr_pages);
571         if (error)
572                 goto Free_image_entries;
573
574         swap_map = reverse_swap_map(swap_map);
575         error = save_swap_map(swap_map, &swsusp_info.start);
576         if (error)
577                 goto Free_map_entries;
578
579         error = close_swap();
580         if (error)
581                 goto Free_map_entries;
582
583 Free_swap_map:
584         free_swap_map(swap_map);
585         return error;
586
587 Free_map_entries:
588         free_swap_map_entries(swap_map);
589 Free_image_entries:
590         free_image_entries(swap_map);
591         goto Free_swap_map;
592 }
593
594 /* It is important _NOT_ to umount filesystems at this point. We want
595  * them synced (in case something goes wrong) but we do NOT want to mark
596  * the filesystems clean: they are not. (And it does not matter; if we
597  * resume correctly, we'll mark the system clean anyway.)
598  */
599 int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
600 {
601         int error;
602
603         if ((error = swsusp_swap_check())) {
604                 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
605                 return error;
606         }
607         lock_swapdevices();
608         error = write_suspend_image(pblist, nr_pages);
609         /* This will unlock ignored swap devices since writing is finished */
610         lock_swapdevices();
611         return error;
612 }
613
614 int swsusp_suspend(void)
615 {
616         int error;
617
618         if ((error = arch_prepare_suspend()))
619                 return error;
620         local_irq_disable();
621         /* At this point, device_suspend() has been called, but *not*
622          * device_power_down(). We *must* device_power_down() now.
623          * Otherwise, drivers for some devices (e.g. interrupt controllers)
624          * become desynchronized with the actual state of the hardware
625          * at resume time, and evil weirdness ensues.
626          */
627         if ((error = device_power_down(PMSG_FREEZE))) {
628                 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
629                 goto Enable_irqs;
630         }
631
632         if ((error = save_highmem())) {
633                 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
634                 goto Restore_highmem;
635         }
636
637         save_processor_state();
638         if ((error = swsusp_arch_suspend()))
639                 printk(KERN_ERR "Error %d suspending\n", error);
640         /* Restore control flow magically appears here */
641         restore_processor_state();
642 Restore_highmem:
643         restore_highmem();
644         device_power_up();
645 Enable_irqs:
646         local_irq_enable();
647         return error;
648 }
649
650 int swsusp_resume(void)
651 {
652         int error;
653         local_irq_disable();
654         if (device_power_down(PMSG_FREEZE))
655                 printk(KERN_ERR "Some devices failed to power down, very bad\n");
656         /* We'll ignore saved state, but this gets preempt count (etc) right */
657         save_processor_state();
658         error = swsusp_arch_resume();
659         /* Code below is only ever reached in case of failure. Otherwise
660          * execution continues at place where swsusp_arch_suspend was called
661          */
662         BUG_ON(!error);
663         /* The only reason why swsusp_arch_resume() can fail is memory being
664          * very tight, so we have to free it as soon as we can to avoid
665          * subsequent failures
666          */
667         swsusp_free();
668         restore_processor_state();
669         restore_highmem();
670         touch_softlockup_watchdog();
671         device_power_up();
672         local_irq_enable();
673         return error;
674 }
675
676 /**
677  *      mark_unsafe_pages - mark the pages that cannot be used for storing
678  *      the image during resume, because they conflict with the pages that
679  *      had been used before suspend
680  */
681
682 static void mark_unsafe_pages(struct pbe *pblist)
683 {
684         struct zone *zone;
685         unsigned long zone_pfn;
686         struct pbe *p;
687
688         if (!pblist) /* a sanity check */
689                 return;
690
691         /* Clear page flags */
692         for_each_zone (zone) {
693                 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
694                         if (pfn_valid(zone_pfn + zone->zone_start_pfn))
695                                 ClearPageNosaveFree(pfn_to_page(zone_pfn +
696                                         zone->zone_start_pfn));
697         }
698
699         /* Mark orig addresses */
700         for_each_pbe (p, pblist)
701                 SetPageNosaveFree(virt_to_page(p->orig_address));
702
703 }
704
705 static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
706 {
707         /* We assume both lists contain the same number of elements */
708         while (src) {
709                 dst->orig_address = src->orig_address;
710                 dst = dst->next;
711                 src = src->next;
712         }
713 }
714
715 /*
716  *      Using bio to read from swap.
717  *      This code requires a bit more work than just using buffer heads,
718  *      but it is the recommended way for 2.5/2.6.
719  *      The following is used to signal the beginning and end of I/O.  Bios
720  *      finish asynchronously, while we want them to happen synchronously.
721  *      A simple atomic_t and a wait loop take care of this problem.
722  */
723
724 static atomic_t io_done = ATOMIC_INIT(0);
725
726 static int end_io(struct bio *bio, unsigned int num, int err)
727 {
728         if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
729                 panic("I/O error reading memory image");
730         atomic_set(&io_done, 0);
731         return 0;
732 }
733
734 static struct block_device *resume_bdev;
735
736 /**
737  *      submit - submit BIO request.
738  *      @rw:    READ or WRITE.
739  *      @page_off:      physical offset of the page.
740  *      @page:  page we're reading or writing.
741  *
742  *      Straight from the textbook - allocate and initialize the bio.
743  *      If we're writing, make sure the page is marked as dirty.
744  *      Then submit it and wait.
745  */
746
747 static int submit(int rw, pgoff_t page_off, void *page)
748 {
749         int error = 0;
750         struct bio *bio;
751
752         bio = bio_alloc(GFP_ATOMIC, 1);
753         if (!bio)
754                 return -ENOMEM;
755         bio->bi_sector = page_off * (PAGE_SIZE >> 9);
756         bio_get(bio);
757         bio->bi_bdev = resume_bdev;
758         bio->bi_end_io = end_io;
759
760         if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
761                 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
762                 error = -EFAULT;
763                 goto Done;
764         }
765
766         if (rw == WRITE)
767                 bio_set_pages_dirty(bio);
768
769         atomic_set(&io_done, 1);
770         submit_bio(rw | (1 << BIO_RW_SYNC), bio);
771         while (atomic_read(&io_done))
772                 yield();
773
774  Done:
775         bio_put(bio);
776         return error;
777 }
778
779 static int bio_read_page(pgoff_t page_off, void *page)
780 {
781         return submit(READ, page_off, page);
782 }
783
784 static int bio_write_page(pgoff_t page_off, void *page)
785 {
786         return submit(WRITE, page_off, page);
787 }
788
789 /**
790  *      The following functions allow us to read data using a swap map
791  *      in a file-like way
792  */
793
794 static inline void release_swap_map_reader(struct swap_map_handle *handle)
795 {
796         if (handle->cur)
797                 free_page((unsigned long)handle->cur);
798         handle->cur = NULL;
799 }
800
801 static inline int get_swap_map_reader(struct swap_map_handle *handle,
802                                       swp_entry_t start)
803 {
804         int error;
805
806         if (!swp_offset(start))
807                 return -EINVAL;
808         handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
809         if (!handle->cur)
810                 return -ENOMEM;
811         error = bio_read_page(swp_offset(start), handle->cur);
812         if (error) {
813                 release_swap_map_reader(handle);
814                 return error;
815         }
816         handle->k = 0;
817         return 0;
818 }
819
820 static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
821 {
822         unsigned long offset;
823         int error;
824
825         if (!handle->cur)
826                 return -EINVAL;
827         offset = swp_offset(handle->cur->entries[handle->k]);
828         if (!offset)
829                 return -EINVAL;
830         error = bio_read_page(offset, buf);
831         if (error)
832                 return error;
833         if (++handle->k >= MAP_PAGE_SIZE) {
834                 handle->k = 0;
835                 offset = swp_offset(handle->cur->next_swap);
836                 if (!offset)
837                         release_swap_map_reader(handle);
838                 else
839                         error = bio_read_page(offset, handle->cur);
840         }
841         return error;
842 }
843
844 /*
845  * Sanity check whether this image makes sense with this kernel/swap context.
846  * I really don't think that it's foolproof, but it's better than nothing...
847  */
848
849 static const char *sanity_check(void)
850 {
851         dump_info();
852         if (swsusp_info.version_code != LINUX_VERSION_CODE)
853                 return "kernel version";
854         if (swsusp_info.num_physpages != num_physpages)
855                 return "memory size";
856         if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
857                 return "system type";
858         if (strcmp(swsusp_info.uts.release,system_utsname.release))
859                 return "kernel release";
860         if (strcmp(swsusp_info.uts.version,system_utsname.version))
861                 return "version";
862         if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
863                 return "machine";
864 #if 0
865         /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
866         if (swsusp_info.cpus != num_possible_cpus())
867                 return "number of cpus";
868 #endif
869         return NULL;
870 }
871
872 static int check_header(void)
873 {
874         const char *reason = NULL;
875         int error;
876
877         if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
878                 return error;
879
880         /* Is this the same machine? */
881         if ((reason = sanity_check())) {
882                 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
883                 return -EPERM;
884         }
885         return error;
886 }
887
888 static int check_sig(void)
889 {
890         int error;
891
892         memset(&swsusp_header, 0, sizeof(swsusp_header));
893         if ((error = bio_read_page(0, &swsusp_header)))
894                 return error;
895         if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
896                 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
897
898                 /*
899                  * Reset swap signature now.
900                  */
901                 error = bio_write_page(0, &swsusp_header);
902         } else {
903                 return -EINVAL;
904         }
905         if (!error)
906                 pr_debug("swsusp: Signature found, resuming\n");
907         return error;
908 }
909
910 /**
911  *      load_image_data - load the image data using the swap map handle
912  *      @handle and store them using the page backup list @pblist
913  *      (assume there are @nr_pages pages to load)
914  */
915
916 static int load_image_data(struct pbe *pblist,
917                            struct swap_map_handle *handle,
918                            unsigned int nr_pages)
919 {
920         int error;
921         unsigned int m;
922         struct pbe *p;
923
924         if (!pblist)
925                 return -EINVAL;
926         printk("Loading image data pages (%u pages) ...     ", nr_pages);
927         m = nr_pages / 100;
928         if (!m)
929                 m = 1;
930         nr_pages = 0;
931         p = pblist;
932         while (p) {
933                 error = swap_map_read_page(handle, (void *)p->address);
934                 if (error)
935                         break;
936                 p = p->next;
937                 if (!(nr_pages % m))
938                         printk("\b\b\b\b%3d%%", nr_pages / m);
939                 nr_pages++;
940         }
941         if (!error)
942                 printk("\b\b\b\bdone\n");
943         return error;
944 }
945
946 /**
947  *      unpack_orig_addresses - copy the elements of @buf[] (1 page) to
948  *      the PBEs in the list starting at @pbe
949  */
950
951 static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
952                                                 struct pbe *pbe)
953 {
954         int j;
955
956         for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
957                 pbe->orig_address = buf[j];
958                 pbe = pbe->next;
959         }
960         return pbe;
961 }
962
963 /**
964  *      load_image_metadata - load the image metadata using the swap map
965  *      handle @handle and put them into the PBEs in the list @pblist
966  */
967
968 static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
969 {
970         struct pbe *p;
971         unsigned long *buf;
972         unsigned int n = 0;
973         int error = 0;
974
975         printk("Loading image metadata ... ");
976         buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
977         if (!buf)
978                 return -ENOMEM;
979         p = pblist;
980         while (p) {
981                 error = swap_map_read_page(handle, buf);
982                 if (error)
983                         break;
984                 p = unpack_orig_addresses(buf, p);
985                 n++;
986         }
987         free_page((unsigned long)buf);
988         if (!error)
989                 printk("done (%u pages loaded)\n", n);
990         return error;
991 }
992
993 static int check_suspend_image(void)
994 {
995         int error = 0;
996
997         if ((error = check_sig()))
998                 return error;
999
1000         if ((error = check_header()))
1001                 return error;
1002
1003         return 0;
1004 }
1005
1006 static int read_suspend_image(struct pbe **pblist_ptr)
1007 {
1008         int error = 0;
1009         struct pbe *p, *pblist;
1010         struct swap_map_handle handle;
1011         unsigned int nr_pages = swsusp_info.image_pages;
1012
1013         p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
1014         if (!p)
1015                 return -ENOMEM;
1016         error = get_swap_map_reader(&handle, swsusp_info.start);
1017         if (error)
1018                 /* The PBE list at p will be released by swsusp_free() */
1019                 return error;
1020         error = load_image_metadata(p, &handle);
1021         if (!error) {
1022                 mark_unsafe_pages(p);
1023                 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
1024                 if (pblist)
1025                         copy_page_backup_list(pblist, p);
1026                 free_pagedir(p);
1027                 if (!pblist)
1028                         error = -ENOMEM;
1029
1030                 /* Allocate memory for the image and read the data from swap */
1031                 if (!error)
1032                         error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
1033                 if (!error)
1034                         error = load_image_data(pblist, &handle, nr_pages);
1035                 if (!error)
1036                         *pblist_ptr = pblist;
1037         }
1038         release_swap_map_reader(&handle);
1039         return error;
1040 }
1041
1042 /**
1043  *      swsusp_check - Check for saved image in swap
1044  */
1045
1046 int swsusp_check(void)
1047 {
1048         int error;
1049
1050         resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1051         if (!IS_ERR(resume_bdev)) {
1052                 set_blocksize(resume_bdev, PAGE_SIZE);
1053                 error = check_suspend_image();
1054                 if (error)
1055                     blkdev_put(resume_bdev);
1056         } else
1057                 error = PTR_ERR(resume_bdev);
1058
1059         if (!error)
1060                 pr_debug("swsusp: resume file found\n");
1061         else
1062                 pr_debug("swsusp: Error %d checking for resume file\n", error);
1063         return error;
1064 }
1065
1066 /**
1067  *      swsusp_read - Read saved image from swap.
1068  */
1069
1070 int swsusp_read(struct pbe **pblist_ptr)
1071 {
1072         int error;
1073
1074         if (IS_ERR(resume_bdev)) {
1075                 pr_debug("swsusp: block device not initialised\n");
1076                 return PTR_ERR(resume_bdev);
1077         }
1078
1079         error = read_suspend_image(pblist_ptr);
1080         blkdev_put(resume_bdev);
1081
1082         if (!error)
1083                 pr_debug("swsusp: Reading resume file was successful\n");
1084         else
1085                 pr_debug("swsusp: Error %d resuming\n", error);
1086         return error;
1087 }
1088
1089 /**
1090  *      swsusp_close - close swap device.
1091  */
1092
1093 void swsusp_close(void)
1094 {
1095         if (IS_ERR(resume_bdev)) {
1096                 pr_debug("swsusp: block device not initialised\n");
1097                 return;
1098         }
1099
1100         blkdev_put(resume_bdev);
1101 }