Merge master.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6
[linux-2.6] / fs / reiserfs / fix_node.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /**
6  ** old_item_num
7  ** old_entry_num
8  ** set_entry_sizes
9  ** create_virtual_node
10  ** check_left
11  ** check_right
12  ** directory_part_size
13  ** get_num_ver
14  ** set_parameters
15  ** is_leaf_removable
16  ** are_leaves_removable
17  ** get_empty_nodes
18  ** get_lfree
19  ** get_rfree
20  ** is_left_neighbor_in_cache
21  ** decrement_key
22  ** get_far_parent
23  ** get_parents
24  ** can_node_be_removed
25  ** ip_check_balance
26  ** dc_check_balance_internal
27  ** dc_check_balance_leaf
28  ** dc_check_balance
29  ** check_balance
30  ** get_direct_parent
31  ** get_neighbors
32  ** fix_nodes
33  ** 
34  ** 
35  **/
36
37 #include <linux/time.h>
38 #include <linux/string.h>
39 #include <linux/reiserfs_fs.h>
40 #include <linux/buffer_head.h>
41
42 /* To make any changes in the tree we find a node, that contains item
43    to be changed/deleted or position in the node we insert a new item
44    to. We call this node S. To do balancing we need to decide what we
45    will shift to left/right neighbor, or to a new node, where new item
46    will be etc. To make this analysis simpler we build virtual
47    node. Virtual node is an array of items, that will replace items of
48    node S. (For instance if we are going to delete an item, virtual
49    node does not contain it). Virtual node keeps information about
50    item sizes and types, mergeability of first and last items, sizes
51    of all entries in directory item. We use this array of items when
52    calculating what we can shift to neighbors and how many nodes we
53    have to have if we do not any shiftings, if we shift to left/right
54    neighbor or to both. */
55
56 /* taking item number in virtual node, returns number of item, that it has in source buffer */
57 static inline int old_item_num(int new_num, int affected_item_num, int mode)
58 {
59         if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
60                 return new_num;
61
62         if (mode == M_INSERT) {
63
64                 RFALSE(new_num == 0,
65                        "vs-8005: for INSERT mode and item number of inserted item");
66
67                 return new_num - 1;
68         }
69
70         RFALSE(mode != M_DELETE,
71                "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
72                mode);
73         /* delete mode */
74         return new_num + 1;
75 }
76
77 static void create_virtual_node(struct tree_balance *tb, int h)
78 {
79         struct item_head *ih;
80         struct virtual_node *vn = tb->tb_vn;
81         int new_num;
82         struct buffer_head *Sh; /* this comes from tb->S[h] */
83
84         Sh = PATH_H_PBUFFER(tb->tb_path, h);
85
86         /* size of changed node */
87         vn->vn_size =
88             MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
89
90         /* for internal nodes array if virtual items is not created */
91         if (h) {
92                 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
93                 return;
94         }
95
96         /* number of items in virtual node  */
97         vn->vn_nr_item =
98             B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
99             ((vn->vn_mode == M_DELETE) ? 1 : 0);
100
101         /* first virtual item */
102         vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103         memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
104         vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
105
106         /* first item in the node */
107         ih = B_N_PITEM_HEAD(Sh, 0);
108
109         /* define the mergeability for 0-th item (if it is not being deleted) */
110         if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
111             && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112                 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
113
114         /* go through all items those remain in the virtual node (except for the new (inserted) one) */
115         for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
116                 int j;
117                 struct virtual_item *vi = vn->vn_vi + new_num;
118                 int is_affected =
119                     ((new_num != vn->vn_affected_item_num) ? 0 : 1);
120
121                 if (is_affected && vn->vn_mode == M_INSERT)
122                         continue;
123
124                 /* get item number in source node */
125                 j = old_item_num(new_num, vn->vn_affected_item_num,
126                                  vn->vn_mode);
127
128                 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
129                 vi->vi_ih = ih + j;
130                 vi->vi_item = B_I_PITEM(Sh, ih + j);
131                 vi->vi_uarea = vn->vn_free_ptr;
132
133                 // FIXME: there is no check, that item operation did not
134                 // consume too much memory
135                 vn->vn_free_ptr +=
136                     op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
137                 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
138                         reiserfs_panic(tb->tb_sb,
139                                        "vs-8030: create_virtual_node: "
140                                        "virtual node space consumed");
141
142                 if (!is_affected)
143                         /* this is not being changed */
144                         continue;
145
146                 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
147                         vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
148                         vi->vi_new_data = vn->vn_data;  // pointer to data which is going to be pasted
149                 }
150         }
151
152         /* virtual inserted item is not defined yet */
153         if (vn->vn_mode == M_INSERT) {
154                 struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
155
156                 RFALSE(vn->vn_ins_ih == 0,
157                        "vs-8040: item header of inserted item is not specified");
158                 vi->vi_item_len = tb->insert_size[0];
159                 vi->vi_ih = vn->vn_ins_ih;
160                 vi->vi_item = vn->vn_data;
161                 vi->vi_uarea = vn->vn_free_ptr;
162
163                 op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
164                              tb->insert_size[0]);
165         }
166
167         /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
168         if (tb->CFR[0]) {
169                 struct reiserfs_key *key;
170
171                 key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
172                 if (op_is_left_mergeable(key, Sh->b_size)
173                     && (vn->vn_mode != M_DELETE
174                         || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
175                         vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
176                             VI_TYPE_RIGHT_MERGEABLE;
177
178 #ifdef CONFIG_REISERFS_CHECK
179                 if (op_is_left_mergeable(key, Sh->b_size) &&
180                     !(vn->vn_mode != M_DELETE
181                       || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
182                         /* we delete last item and it could be merged with right neighbor's first item */
183                         if (!
184                             (B_NR_ITEMS(Sh) == 1
185                              && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
186                              && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
187                                 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
188                                 print_block(Sh, 0, -1, -1);
189                                 reiserfs_panic(tb->tb_sb,
190                                                "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
191                                                key, vn->vn_affected_item_num,
192                                                vn->vn_mode, M_DELETE);
193                         }
194                 }
195 #endif
196
197         }
198 }
199
200 /* using virtual node check, how many items can be shifted to left
201    neighbor */
202 static void check_left(struct tree_balance *tb, int h, int cur_free)
203 {
204         int i;
205         struct virtual_node *vn = tb->tb_vn;
206         struct virtual_item *vi;
207         int d_size, ih_size;
208
209         RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
210
211         /* internal level */
212         if (h > 0) {
213                 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
214                 return;
215         }
216
217         /* leaf level */
218
219         if (!cur_free || !vn->vn_nr_item) {
220                 /* no free space or nothing to move */
221                 tb->lnum[h] = 0;
222                 tb->lbytes = -1;
223                 return;
224         }
225
226         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
227                "vs-8055: parent does not exist or invalid");
228
229         vi = vn->vn_vi;
230         if ((unsigned int)cur_free >=
231             (vn->vn_size -
232              ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
233                 /* all contents of S[0] fits into L[0] */
234
235                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
236                        "vs-8055: invalid mode or balance condition failed");
237
238                 tb->lnum[0] = vn->vn_nr_item;
239                 tb->lbytes = -1;
240                 return;
241         }
242
243         d_size = 0, ih_size = IH_SIZE;
244
245         /* first item may be merge with last item in left neighbor */
246         if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
247                 d_size = -((int)IH_SIZE), ih_size = 0;
248
249         tb->lnum[0] = 0;
250         for (i = 0; i < vn->vn_nr_item;
251              i++, ih_size = IH_SIZE, d_size = 0, vi++) {
252                 d_size += vi->vi_item_len;
253                 if (cur_free >= d_size) {
254                         /* the item can be shifted entirely */
255                         cur_free -= d_size;
256                         tb->lnum[0]++;
257                         continue;
258                 }
259
260                 /* the item cannot be shifted entirely, try to split it */
261                 /* check whether L[0] can hold ih and at least one byte of the item body */
262                 if (cur_free <= ih_size) {
263                         /* cannot shift even a part of the current item */
264                         tb->lbytes = -1;
265                         return;
266                 }
267                 cur_free -= ih_size;
268
269                 tb->lbytes = op_check_left(vi, cur_free, 0, 0);
270                 if (tb->lbytes != -1)
271                         /* count partially shifted item */
272                         tb->lnum[0]++;
273
274                 break;
275         }
276
277         return;
278 }
279
280 /* using virtual node check, how many items can be shifted to right
281    neighbor */
282 static void check_right(struct tree_balance *tb, int h, int cur_free)
283 {
284         int i;
285         struct virtual_node *vn = tb->tb_vn;
286         struct virtual_item *vi;
287         int d_size, ih_size;
288
289         RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
290
291         /* internal level */
292         if (h > 0) {
293                 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
294                 return;
295         }
296
297         /* leaf level */
298
299         if (!cur_free || !vn->vn_nr_item) {
300                 /* no free space  */
301                 tb->rnum[h] = 0;
302                 tb->rbytes = -1;
303                 return;
304         }
305
306         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
307                "vs-8075: parent does not exist or invalid");
308
309         vi = vn->vn_vi + vn->vn_nr_item - 1;
310         if ((unsigned int)cur_free >=
311             (vn->vn_size -
312              ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
313                 /* all contents of S[0] fits into R[0] */
314
315                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
316                        "vs-8080: invalid mode or balance condition failed");
317
318                 tb->rnum[h] = vn->vn_nr_item;
319                 tb->rbytes = -1;
320                 return;
321         }
322
323         d_size = 0, ih_size = IH_SIZE;
324
325         /* last item may be merge with first item in right neighbor */
326         if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
327                 d_size = -(int)IH_SIZE, ih_size = 0;
328
329         tb->rnum[0] = 0;
330         for (i = vn->vn_nr_item - 1; i >= 0;
331              i--, d_size = 0, ih_size = IH_SIZE, vi--) {
332                 d_size += vi->vi_item_len;
333                 if (cur_free >= d_size) {
334                         /* the item can be shifted entirely */
335                         cur_free -= d_size;
336                         tb->rnum[0]++;
337                         continue;
338                 }
339
340                 /* check whether R[0] can hold ih and at least one byte of the item body */
341                 if (cur_free <= ih_size) {      /* cannot shift even a part of the current item */
342                         tb->rbytes = -1;
343                         return;
344                 }
345
346                 /* R[0] can hold the header of the item and at least one byte of its body */
347                 cur_free -= ih_size;    /* cur_free is still > 0 */
348
349                 tb->rbytes = op_check_right(vi, cur_free);
350                 if (tb->rbytes != -1)
351                         /* count partially shifted item */
352                         tb->rnum[0]++;
353
354                 break;
355         }
356
357         return;
358 }
359
360 /*
361  * from - number of items, which are shifted to left neighbor entirely
362  * to - number of item, which are shifted to right neighbor entirely
363  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
364  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
365 static int get_num_ver(int mode, struct tree_balance *tb, int h,
366                        int from, int from_bytes,
367                        int to, int to_bytes, short *snum012, int flow)
368 {
369         int i;
370         int cur_free;
371         //    int bytes;
372         int units;
373         struct virtual_node *vn = tb->tb_vn;
374         //    struct virtual_item * vi;
375
376         int total_node_size, max_node_size, current_item_size;
377         int needed_nodes;
378         int start_item,         /* position of item we start filling node from */
379          end_item,              /* position of item we finish filling node by */
380          start_bytes,           /* number of first bytes (entries for directory) of start_item-th item 
381                                    we do not include into node that is being filled */
382          end_bytes;             /* number of last bytes (entries for directory) of end_item-th item 
383                                    we do node include into node that is being filled */
384         int split_item_positions[2];    /* these are positions in virtual item of
385                                            items, that are split between S[0] and
386                                            S1new and S1new and S2new */
387
388         split_item_positions[0] = -1;
389         split_item_positions[1] = -1;
390
391         /* We only create additional nodes if we are in insert or paste mode
392            or we are in replace mode at the internal level. If h is 0 and
393            the mode is M_REPLACE then in fix_nodes we change the mode to
394            paste or insert before we get here in the code.  */
395         RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
396                "vs-8100: insert_size < 0 in overflow");
397
398         max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
399
400         /* snum012 [0-2] - number of items, that lay
401            to S[0], first new node and second new node */
402         snum012[3] = -1;        /* s1bytes */
403         snum012[4] = -1;        /* s2bytes */
404
405         /* internal level */
406         if (h > 0) {
407                 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
408                 if (i == max_node_size)
409                         return 1;
410                 return (i / max_node_size + 1);
411         }
412
413         /* leaf level */
414         needed_nodes = 1;
415         total_node_size = 0;
416         cur_free = max_node_size;
417
418         // start from 'from'-th item
419         start_item = from;
420         // skip its first 'start_bytes' units
421         start_bytes = ((from_bytes != -1) ? from_bytes : 0);
422
423         // last included item is the 'end_item'-th one
424         end_item = vn->vn_nr_item - to - 1;
425         // do not count last 'end_bytes' units of 'end_item'-th item
426         end_bytes = (to_bytes != -1) ? to_bytes : 0;
427
428         /* go through all item beginning from the start_item-th item and ending by
429            the end_item-th item. Do not count first 'start_bytes' units of
430            'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
431
432         for (i = start_item; i <= end_item; i++) {
433                 struct virtual_item *vi = vn->vn_vi + i;
434                 int skip_from_end = ((i == end_item) ? end_bytes : 0);
435
436                 RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
437
438                 /* get size of current item */
439                 current_item_size = vi->vi_item_len;
440
441                 /* do not take in calculation head part (from_bytes) of from-th item */
442                 current_item_size -=
443                     op_part_size(vi, 0 /*from start */ , start_bytes);
444
445                 /* do not take in calculation tail part of last item */
446                 current_item_size -=
447                     op_part_size(vi, 1 /*from end */ , skip_from_end);
448
449                 /* if item fits into current node entierly */
450                 if (total_node_size + current_item_size <= max_node_size) {
451                         snum012[needed_nodes - 1]++;
452                         total_node_size += current_item_size;
453                         start_bytes = 0;
454                         continue;
455                 }
456
457                 if (current_item_size > max_node_size) {
458                         /* virtual item length is longer, than max size of item in
459                            a node. It is impossible for direct item */
460                         RFALSE(is_direct_le_ih(vi->vi_ih),
461                                "vs-8110: "
462                                "direct item length is %d. It can not be longer than %d",
463                                current_item_size, max_node_size);
464                         /* we will try to split it */
465                         flow = 1;
466                 }
467
468                 if (!flow) {
469                         /* as we do not split items, take new node and continue */
470                         needed_nodes++;
471                         i--;
472                         total_node_size = 0;
473                         continue;
474                 }
475                 // calculate number of item units which fit into node being
476                 // filled
477                 {
478                         int free_space;
479
480                         free_space = max_node_size - total_node_size - IH_SIZE;
481                         units =
482                             op_check_left(vi, free_space, start_bytes,
483                                           skip_from_end);
484                         if (units == -1) {
485                                 /* nothing fits into current node, take new node and continue */
486                                 needed_nodes++, i--, total_node_size = 0;
487                                 continue;
488                         }
489                 }
490
491                 /* something fits into the current node */
492                 //if (snum012[3] != -1 || needed_nodes != 1)
493                 //  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
494                 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
495                 start_bytes += units;
496                 snum012[needed_nodes - 1 + 3] = units;
497
498                 if (needed_nodes > 2)
499                         reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: "
500                                          "split_item_position is out of boundary");
501                 snum012[needed_nodes - 1]++;
502                 split_item_positions[needed_nodes - 1] = i;
503                 needed_nodes++;
504                 /* continue from the same item with start_bytes != -1 */
505                 start_item = i;
506                 i--;
507                 total_node_size = 0;
508         }
509
510         // sum012[4] (if it is not -1) contains number of units of which
511         // are to be in S1new, snum012[3] - to be in S0. They are supposed
512         // to be S1bytes and S2bytes correspondingly, so recalculate
513         if (snum012[4] > 0) {
514                 int split_item_num;
515                 int bytes_to_r, bytes_to_l;
516                 int bytes_to_S1new;
517
518                 split_item_num = split_item_positions[1];
519                 bytes_to_l =
520                     ((from == split_item_num
521                       && from_bytes != -1) ? from_bytes : 0);
522                 bytes_to_r =
523                     ((end_item == split_item_num
524                       && end_bytes != -1) ? end_bytes : 0);
525                 bytes_to_S1new =
526                     ((split_item_positions[0] ==
527                       split_item_positions[1]) ? snum012[3] : 0);
528
529                 // s2bytes
530                 snum012[4] =
531                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
532                     bytes_to_r - bytes_to_l - bytes_to_S1new;
533
534                 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
535                     vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
536                         reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not "
537                                          "directory or indirect item");
538         }
539
540         /* now we know S2bytes, calculate S1bytes */
541         if (snum012[3] > 0) {
542                 int split_item_num;
543                 int bytes_to_r, bytes_to_l;
544                 int bytes_to_S2new;
545
546                 split_item_num = split_item_positions[0];
547                 bytes_to_l =
548                     ((from == split_item_num
549                       && from_bytes != -1) ? from_bytes : 0);
550                 bytes_to_r =
551                     ((end_item == split_item_num
552                       && end_bytes != -1) ? end_bytes : 0);
553                 bytes_to_S2new =
554                     ((split_item_positions[0] == split_item_positions[1]
555                       && snum012[4] != -1) ? snum012[4] : 0);
556
557                 // s1bytes
558                 snum012[3] =
559                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
560                     bytes_to_r - bytes_to_l - bytes_to_S2new;
561         }
562
563         return needed_nodes;
564 }
565
566 #ifdef CONFIG_REISERFS_CHECK
567 extern struct tree_balance *cur_tb;
568 #endif
569
570 /* Set parameters for balancing.
571  * Performs write of results of analysis of balancing into structure tb,
572  * where it will later be used by the functions that actually do the balancing. 
573  * Parameters:
574  *      tb      tree_balance structure;
575  *      h       current level of the node;
576  *      lnum    number of items from S[h] that must be shifted to L[h];
577  *      rnum    number of items from S[h] that must be shifted to R[h];
578  *      blk_num number of blocks that S[h] will be splitted into;
579  *      s012    number of items that fall into splitted nodes.
580  *      lbytes  number of bytes which flow to the left neighbor from the item that is not
581  *              not shifted entirely
582  *      rbytes  number of bytes which flow to the right neighbor from the item that is not
583  *              not shifted entirely
584  *      s1bytes number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
585  */
586
587 static void set_parameters(struct tree_balance *tb, int h, int lnum,
588                            int rnum, int blk_num, short *s012, int lb, int rb)
589 {
590
591         tb->lnum[h] = lnum;
592         tb->rnum[h] = rnum;
593         tb->blknum[h] = blk_num;
594
595         if (h == 0) {           /* only for leaf level */
596                 if (s012 != NULL) {
597                         tb->s0num = *s012++,
598                             tb->s1num = *s012++, tb->s2num = *s012++;
599                         tb->s1bytes = *s012++;
600                         tb->s2bytes = *s012;
601                 }
602                 tb->lbytes = lb;
603                 tb->rbytes = rb;
604         }
605         PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
606         PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
607
608         PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
609         PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
610 }
611
612 /* check, does node disappear if we shift tb->lnum[0] items to left
613    neighbor and tb->rnum[0] to the right one. */
614 static int is_leaf_removable(struct tree_balance *tb)
615 {
616         struct virtual_node *vn = tb->tb_vn;
617         int to_left, to_right;
618         int size;
619         int remain_items;
620
621         /* number of items, that will be shifted to left (right) neighbor
622            entirely */
623         to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
624         to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
625         remain_items = vn->vn_nr_item;
626
627         /* how many items remain in S[0] after shiftings to neighbors */
628         remain_items -= (to_left + to_right);
629
630         if (remain_items < 1) {
631                 /* all content of node can be shifted to neighbors */
632                 set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
633                                NULL, -1, -1);
634                 return 1;
635         }
636
637         if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
638                 /* S[0] is not removable */
639                 return 0;
640
641         /* check, whether we can divide 1 remaining item between neighbors */
642
643         /* get size of remaining item (in item units) */
644         size = op_unit_num(&(vn->vn_vi[to_left]));
645
646         if (tb->lbytes + tb->rbytes >= size) {
647                 set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
648                                tb->lbytes, -1);
649                 return 1;
650         }
651
652         return 0;
653 }
654
655 /* check whether L, S, R can be joined in one node */
656 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
657 {
658         struct virtual_node *vn = tb->tb_vn;
659         int ih_size;
660         struct buffer_head *S0;
661
662         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
663
664         ih_size = 0;
665         if (vn->vn_nr_item) {
666                 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
667                         ih_size += IH_SIZE;
668
669                 if (vn->vn_vi[vn->vn_nr_item - 1].
670                     vi_type & VI_TYPE_RIGHT_MERGEABLE)
671                         ih_size += IH_SIZE;
672         } else {
673                 /* there was only one item and it will be deleted */
674                 struct item_head *ih;
675
676                 RFALSE(B_NR_ITEMS(S0) != 1,
677                        "vs-8125: item number must be 1: it is %d",
678                        B_NR_ITEMS(S0));
679
680                 ih = B_N_PITEM_HEAD(S0, 0);
681                 if (tb->CFR[0]
682                     && !comp_short_le_keys(&(ih->ih_key),
683                                            B_N_PDELIM_KEY(tb->CFR[0],
684                                                           tb->rkey[0])))
685                         if (is_direntry_le_ih(ih)) {
686                                 /* Directory must be in correct state here: that is
687                                    somewhere at the left side should exist first directory
688                                    item. But the item being deleted can not be that first
689                                    one because its right neighbor is item of the same
690                                    directory. (But first item always gets deleted in last
691                                    turn). So, neighbors of deleted item can be merged, so
692                                    we can save ih_size */
693                                 ih_size = IH_SIZE;
694
695                                 /* we might check that left neighbor exists and is of the
696                                    same directory */
697                                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
698                                        "vs-8130: first directory item can not be removed until directory is not empty");
699                         }
700
701         }
702
703         if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
704                 set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
705                 PROC_INFO_INC(tb->tb_sb, leaves_removable);
706                 return 1;
707         }
708         return 0;
709
710 }
711
712 /* when we do not split item, lnum and rnum are numbers of entire items */
713 #define SET_PAR_SHIFT_LEFT \
714 if (h)\
715 {\
716    int to_l;\
717    \
718    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
719               (MAX_NR_KEY(Sh) + 1 - lpar);\
720               \
721               set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
722 }\
723 else \
724 {\
725    if (lset==LEFT_SHIFT_FLOW)\
726      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
727                      tb->lbytes, -1);\
728    else\
729      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
730                      -1, -1);\
731 }
732
733 #define SET_PAR_SHIFT_RIGHT \
734 if (h)\
735 {\
736    int to_r;\
737    \
738    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
739    \
740    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
741 }\
742 else \
743 {\
744    if (rset==RIGHT_SHIFT_FLOW)\
745      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
746                   -1, tb->rbytes);\
747    else\
748      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
749                   -1, -1);\
750 }
751
752 static void free_buffers_in_tb(struct tree_balance *p_s_tb)
753 {
754         int n_counter;
755
756         decrement_counters_in_path(p_s_tb->tb_path);
757
758         for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) {
759                 decrement_bcount(p_s_tb->L[n_counter]);
760                 p_s_tb->L[n_counter] = NULL;
761                 decrement_bcount(p_s_tb->R[n_counter]);
762                 p_s_tb->R[n_counter] = NULL;
763                 decrement_bcount(p_s_tb->FL[n_counter]);
764                 p_s_tb->FL[n_counter] = NULL;
765                 decrement_bcount(p_s_tb->FR[n_counter]);
766                 p_s_tb->FR[n_counter] = NULL;
767                 decrement_bcount(p_s_tb->CFL[n_counter]);
768                 p_s_tb->CFL[n_counter] = NULL;
769                 decrement_bcount(p_s_tb->CFR[n_counter]);
770                 p_s_tb->CFR[n_counter] = NULL;
771         }
772 }
773
774 /* Get new buffers for storing new nodes that are created while balancing.
775  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
776  *              CARRY_ON - schedule didn't occur while the function worked;
777  *              NO_DISK_SPACE - no disk space.
778  */
779 /* The function is NOT SCHEDULE-SAFE! */
780 static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
781 {
782         struct buffer_head *p_s_new_bh,
783             *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h);
784         b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
785         int n_counter, n_number_of_freeblk, n_amount_needed,    /* number of needed empty blocks */
786          n_retval = CARRY_ON;
787         struct super_block *p_s_sb = p_s_tb->tb_sb;
788
789         /* number_of_freeblk is the number of empty blocks which have been
790            acquired for use by the balancing algorithm minus the number of
791            empty blocks used in the previous levels of the analysis,
792            number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
793            after empty blocks are acquired, and the balancing analysis is
794            then restarted, amount_needed is the number needed by this level
795            (n_h) of the balancing analysis.
796
797            Note that for systems with many processes writing, it would be
798            more layout optimal to calculate the total number needed by all
799            levels and then to run reiserfs_new_blocks to get all of them at once.  */
800
801         /* Initiate number_of_freeblk to the amount acquired prior to the restart of
802            the analysis or 0 if not restarted, then subtract the amount needed
803            by all of the levels of the tree below n_h. */
804         /* blknum includes S[n_h], so we subtract 1 in this calculation */
805         for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum;
806              n_counter < n_h; n_counter++)
807                 n_number_of_freeblk -=
808                     (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] -
809                                                    1) : 0;
810
811         /* Allocate missing empty blocks. */
812         /* if p_s_Sh == 0  then we are getting a new root */
813         n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1;
814         /*  Amount_needed = the amount that we need more than the amount that we have. */
815         if (n_amount_needed > n_number_of_freeblk)
816                 n_amount_needed -= n_number_of_freeblk;
817         else                    /* If we have enough already then there is nothing to do. */
818                 return CARRY_ON;
819
820         /* No need to check quota - is not allocated for blocks used for formatted nodes */
821         if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs,
822                                        n_amount_needed) == NO_DISK_SPACE)
823                 return NO_DISK_SPACE;
824
825         /* for each blocknumber we just got, get a buffer and stick it on FEB */
826         for (p_n_blocknr = a_n_blocknrs, n_counter = 0;
827              n_counter < n_amount_needed; p_n_blocknr++, n_counter++) {
828
829                 RFALSE(!*p_n_blocknr,
830                        "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
831
832                 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
833                 RFALSE(buffer_dirty(p_s_new_bh) ||
834                        buffer_journaled(p_s_new_bh) ||
835                        buffer_journal_dirty(p_s_new_bh),
836                        "PAP-8140: journlaled or dirty buffer %b for the new block",
837                        p_s_new_bh);
838
839                 /* Put empty buffers into the array. */
840                 RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum],
841                        "PAP-8141: busy slot for new buffer");
842
843                 set_buffer_journal_new(p_s_new_bh);
844                 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
845         }
846
847         if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb))
848                 n_retval = REPEAT_SEARCH;
849
850         return n_retval;
851 }
852
853 /* Get free space of the left neighbor, which is stored in the parent
854  * node of the left neighbor.  */
855 static int get_lfree(struct tree_balance *tb, int h)
856 {
857         struct buffer_head *l, *f;
858         int order;
859
860         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
861                 return 0;
862
863         if (f == l)
864                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
865         else {
866                 order = B_NR_ITEMS(l);
867                 f = l;
868         }
869
870         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
871 }
872
873 /* Get free space of the right neighbor,
874  * which is stored in the parent node of the right neighbor.
875  */
876 static int get_rfree(struct tree_balance *tb, int h)
877 {
878         struct buffer_head *r, *f;
879         int order;
880
881         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
882                 return 0;
883
884         if (f == r)
885                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
886         else {
887                 order = 0;
888                 f = r;
889         }
890
891         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
892
893 }
894
895 /* Check whether left neighbor is in memory. */
896 static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
897 {
898         struct buffer_head *p_s_father, *left;
899         struct super_block *p_s_sb = p_s_tb->tb_sb;
900         b_blocknr_t n_left_neighbor_blocknr;
901         int n_left_neighbor_position;
902
903         if (!p_s_tb->FL[n_h])   /* Father of the left neighbor does not exist. */
904                 return 0;
905
906         /* Calculate father of the node to be balanced. */
907         p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
908
909         RFALSE(!p_s_father ||
910                !B_IS_IN_TREE(p_s_father) ||
911                !B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
912                !buffer_uptodate(p_s_father) ||
913                !buffer_uptodate(p_s_tb->FL[n_h]),
914                "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
915                p_s_father, p_s_tb->FL[n_h]);
916
917         /* Get position of the pointer to the left neighbor into the left father. */
918         n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ?
919             p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]);
920         /* Get left neighbor block number. */
921         n_left_neighbor_blocknr =
922             B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
923         /* Look for the left neighbor in the cache. */
924         if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) {
925
926                 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
927                        "vs-8170: left neighbor (%b %z) is not in the tree",
928                        left, left);
929                 put_bh(left);
930                 return 1;
931         }
932
933         return 0;
934 }
935
936 #define LEFT_PARENTS  'l'
937 #define RIGHT_PARENTS 'r'
938
939 static void decrement_key(struct cpu_key *p_s_key)
940 {
941         // call item specific function for this key
942         item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key);
943 }
944
945 /* Calculate far left/right parent of the left/right neighbor of the current node, that
946  * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
947  * Calculate left/right common parent of the current node and L[h]/R[h].
948  * Calculate left/right delimiting key position.
949  * Returns:     PATH_INCORRECT   - path in the tree is not correct;
950                 SCHEDULE_OCCURRED - schedule occurred while the function worked;
951  *              CARRY_ON         - schedule didn't occur while the function worked;
952  */
953 static int get_far_parent(struct tree_balance *p_s_tb,
954                           int n_h,
955                           struct buffer_head **pp_s_father,
956                           struct buffer_head **pp_s_com_father, char c_lr_par)
957 {
958         struct buffer_head *p_s_parent;
959         INITIALIZE_PATH(s_path_to_neighbor_father);
960         struct treepath *p_s_path = p_s_tb->tb_path;
961         struct cpu_key s_lr_father_key;
962         int n_counter,
963             n_position = INT_MAX,
964             n_first_last_position = 0,
965             n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
966
967         /* Starting from F[n_h] go upwards in the tree, and look for the common
968            ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
969
970         n_counter = n_path_offset;
971
972         RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET,
973                "PAP-8180: invalid path length");
974
975         for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) {
976                 /* Check whether parent of the current buffer in the path is really parent in the tree. */
977                 if (!B_IS_IN_TREE
978                     (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)))
979                         return REPEAT_SEARCH;
980                 /* Check whether position in the parent is correct. */
981                 if ((n_position =
982                      PATH_OFFSET_POSITION(p_s_path,
983                                           n_counter - 1)) >
984                     B_NR_ITEMS(p_s_parent))
985                         return REPEAT_SEARCH;
986                 /* Check whether parent at the path really points to the child. */
987                 if (B_N_CHILD_NUM(p_s_parent, n_position) !=
988                     PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr)
989                         return REPEAT_SEARCH;
990                 /* Return delimiting key if position in the parent is not equal to first/last one. */
991                 if (c_lr_par == RIGHT_PARENTS)
992                         n_first_last_position = B_NR_ITEMS(p_s_parent);
993                 if (n_position != n_first_last_position) {
994                         *pp_s_com_father = p_s_parent;
995                         get_bh(*pp_s_com_father);
996                         /*(*pp_s_com_father = p_s_parent)->b_count++; */
997                         break;
998                 }
999         }
1000
1001         /* if we are in the root of the tree, then there is no common father */
1002         if (n_counter == FIRST_PATH_ELEMENT_OFFSET) {
1003                 /* Check whether first buffer in the path is the root of the tree. */
1004                 if (PATH_OFFSET_PBUFFER
1005                     (p_s_tb->tb_path,
1006                      FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1007                     SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
1008                         *pp_s_father = *pp_s_com_father = NULL;
1009                         return CARRY_ON;
1010                 }
1011                 return REPEAT_SEARCH;
1012         }
1013
1014         RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
1015                "PAP-8185: (%b %z) level too small",
1016                *pp_s_com_father, *pp_s_com_father);
1017
1018         /* Check whether the common parent is locked. */
1019
1020         if (buffer_locked(*pp_s_com_father)) {
1021                 __wait_on_buffer(*pp_s_com_father);
1022                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1023                         decrement_bcount(*pp_s_com_father);
1024                         return REPEAT_SEARCH;
1025                 }
1026         }
1027
1028         /* So, we got common parent of the current node and its left/right neighbor.
1029            Now we are geting the parent of the left/right neighbor. */
1030
1031         /* Form key to get parent of the left/right neighbor. */
1032         le_key2cpu_key(&s_lr_father_key,
1033                        B_N_PDELIM_KEY(*pp_s_com_father,
1034                                       (c_lr_par ==
1035                                        LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] =
1036                                                         n_position -
1037                                                         1) : (p_s_tb->rkey[n_h -
1038                                                                            1] =
1039                                                               n_position)));
1040
1041         if (c_lr_par == LEFT_PARENTS)
1042                 decrement_key(&s_lr_father_key);
1043
1044         if (search_by_key
1045             (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1046              n_h + 1) == IO_ERROR)
1047                 // path is released
1048                 return IO_ERROR;
1049
1050         if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1051                 decrement_counters_in_path(&s_path_to_neighbor_father);
1052                 decrement_bcount(*pp_s_com_father);
1053                 return REPEAT_SEARCH;
1054         }
1055
1056         *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1057
1058         RFALSE(B_LEVEL(*pp_s_father) != n_h + 1,
1059                "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1060         RFALSE(s_path_to_neighbor_father.path_length <
1061                FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1062
1063         s_path_to_neighbor_father.path_length--;
1064         decrement_counters_in_path(&s_path_to_neighbor_father);
1065         return CARRY_ON;
1066 }
1067
1068 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1069  * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1070  * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1071  * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1072  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1073  *              CARRY_ON - schedule didn't occur while the function worked;
1074  */
1075 static int get_parents(struct tree_balance *p_s_tb, int n_h)
1076 {
1077         struct treepath *p_s_path = p_s_tb->tb_path;
1078         int n_position,
1079             n_ret_value,
1080             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1081         struct buffer_head *p_s_curf, *p_s_curcf;
1082
1083         /* Current node is the root of the tree or will be root of the tree */
1084         if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1085                 /* The root can not have parents.
1086                    Release nodes which previously were obtained as parents of the current node neighbors. */
1087                 decrement_bcount(p_s_tb->FL[n_h]);
1088                 decrement_bcount(p_s_tb->CFL[n_h]);
1089                 decrement_bcount(p_s_tb->FR[n_h]);
1090                 decrement_bcount(p_s_tb->CFR[n_h]);
1091                 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] =
1092                     p_s_tb->CFR[n_h] = NULL;
1093                 return CARRY_ON;
1094         }
1095
1096         /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1097         if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) {
1098                 /* Current node is not the first child of its parent. */
1099                 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
1100                 p_s_curf = p_s_curcf =
1101                     PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1102                 get_bh(p_s_curf);
1103                 get_bh(p_s_curf);
1104                 p_s_tb->lkey[n_h] = n_position - 1;
1105         } else {
1106                 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1107                    Calculate current common parent of L[n_path_offset] and the current node. Note that
1108                    CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1109                    Calculate lkey[n_path_offset]. */
1110                 if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1111                                                   &p_s_curcf,
1112                                                   LEFT_PARENTS)) != CARRY_ON)
1113                         return n_ret_value;
1114         }
1115
1116         decrement_bcount(p_s_tb->FL[n_h]);
1117         p_s_tb->FL[n_h] = p_s_curf;     /* New initialization of FL[n_h]. */
1118         decrement_bcount(p_s_tb->CFL[n_h]);
1119         p_s_tb->CFL[n_h] = p_s_curcf;   /* New initialization of CFL[n_h]. */
1120
1121         RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
1122                (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
1123                "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1124
1125 /* Get parent FR[n_h] of R[n_h]. */
1126
1127 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1128         if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) {
1129 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1130    Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1131    not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1132                 if ((n_ret_value =
1133                      get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf,
1134                                     RIGHT_PARENTS)) != CARRY_ON)
1135                         return n_ret_value;
1136         } else {
1137 /* Current node is not the last child of its parent F[n_h]. */
1138                 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
1139                 p_s_curf = p_s_curcf =
1140                     PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1141                 get_bh(p_s_curf);
1142                 get_bh(p_s_curf);
1143                 p_s_tb->rkey[n_h] = n_position;
1144         }
1145
1146         decrement_bcount(p_s_tb->FR[n_h]);
1147         p_s_tb->FR[n_h] = p_s_curf;     /* New initialization of FR[n_path_offset]. */
1148
1149         decrement_bcount(p_s_tb->CFR[n_h]);
1150         p_s_tb->CFR[n_h] = p_s_curcf;   /* New initialization of CFR[n_path_offset]. */
1151
1152         RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
1153                (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
1154                "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
1155
1156         return CARRY_ON;
1157 }
1158
1159 /* it is possible to remove node as result of shiftings to
1160    neighbors even when we insert or paste item. */
1161 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1162                                       struct tree_balance *tb, int h)
1163 {
1164         struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1165         int levbytes = tb->insert_size[h];
1166         struct item_head *ih;
1167         struct reiserfs_key *r_key = NULL;
1168
1169         ih = B_N_PITEM_HEAD(Sh, 0);
1170         if (tb->CFR[h])
1171                 r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
1172
1173         if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1174             /* shifting may merge items which might save space */
1175             -
1176             ((!h
1177               && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
1178             -
1179             ((!h && r_key
1180               && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1181             + ((h) ? KEY_SIZE : 0)) {
1182                 /* node can not be removed */
1183                 if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1184                         if (!h)
1185                                 tb->s0num =
1186                                     B_NR_ITEMS(Sh) +
1187                                     ((mode == M_INSERT) ? 1 : 0);
1188                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1189                         return NO_BALANCING_NEEDED;
1190                 }
1191         }
1192         PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1193         return !NO_BALANCING_NEEDED;
1194 }
1195
1196 /* Check whether current node S[h] is balanced when increasing its size by
1197  * Inserting or Pasting.
1198  * Calculate parameters for balancing for current level h.
1199  * Parameters:
1200  *      tb      tree_balance structure;
1201  *      h       current level of the node;
1202  *      inum    item number in S[h];
1203  *      mode    i - insert, p - paste;
1204  * Returns:     1 - schedule occurred; 
1205  *              0 - balancing for higher levels needed;
1206  *             -1 - no balancing for higher levels needed;
1207  *             -2 - no disk space.
1208  */
1209 /* ip means Inserting or Pasting */
1210 static int ip_check_balance(struct tree_balance *tb, int h)
1211 {
1212         struct virtual_node *vn = tb->tb_vn;
1213         int levbytes,           /* Number of bytes that must be inserted into (value
1214                                    is negative if bytes are deleted) buffer which
1215                                    contains node being balanced.  The mnemonic is
1216                                    that the attempted change in node space used level
1217                                    is levbytes bytes. */
1218          n_ret_value;
1219
1220         int lfree, sfree, rfree /* free space in L, S and R */ ;
1221
1222         /* nver is short for number of vertixes, and lnver is the number if
1223            we shift to the left, rnver is the number if we shift to the
1224            right, and lrnver is the number if we shift in both directions.
1225            The goal is to minimize first the number of vertixes, and second,
1226            the number of vertixes whose contents are changed by shifting,
1227            and third the number of uncached vertixes whose contents are
1228            changed by shifting and must be read from disk.  */
1229         int nver, lnver, rnver, lrnver;
1230
1231         /* used at leaf level only, S0 = S[0] is the node being balanced,
1232            sInum [ I = 0,1,2 ] is the number of items that will
1233            remain in node SI after balancing.  S1 and S2 are new
1234            nodes that might be created. */
1235
1236         /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
1237            where 4th parameter is s1bytes and 5th - s2bytes
1238          */
1239         short snum012[40] = { 0, };     /* s0num, s1num, s2num for 8 cases 
1240                                            0,1 - do not shift and do not shift but bottle
1241                                            2 - shift only whole item to left
1242                                            3 - shift to left and bottle as much as possible
1243                                            4,5 - shift to right (whole items and as much as possible
1244                                            6,7 - shift to both directions (whole items and as much as possible)
1245                                          */
1246
1247         /* Sh is the node whose balance is currently being checked */
1248         struct buffer_head *Sh;
1249
1250         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1251         levbytes = tb->insert_size[h];
1252
1253         /* Calculate balance parameters for creating new root. */
1254         if (!Sh) {
1255                 if (!h)
1256                         reiserfs_panic(tb->tb_sb,
1257                                        "vs-8210: ip_check_balance: S[0] can not be 0");
1258                 switch (n_ret_value = get_empty_nodes(tb, h)) {
1259                 case CARRY_ON:
1260                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1261                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1262
1263                 case NO_DISK_SPACE:
1264                 case REPEAT_SEARCH:
1265                         return n_ret_value;
1266                 default:
1267                         reiserfs_panic(tb->tb_sb,
1268                                        "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1269                 }
1270         }
1271
1272         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)     /* get parents of S[h] neighbors. */
1273                 return n_ret_value;
1274
1275         sfree = B_FREE_SPACE(Sh);
1276
1277         /* get free space of neighbors */
1278         rfree = get_rfree(tb, h);
1279         lfree = get_lfree(tb, h);
1280
1281         if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1282             NO_BALANCING_NEEDED)
1283                 /* and new item fits into node S[h] without any shifting */
1284                 return NO_BALANCING_NEEDED;
1285
1286         create_virtual_node(tb, h);
1287
1288         /*  
1289            determine maximal number of items we can shift to the left neighbor (in tb structure)
1290            and the maximal number of bytes that can flow to the left neighbor
1291            from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1292          */
1293         check_left(tb, h, lfree);
1294
1295         /*
1296            determine maximal number of items we can shift to the right neighbor (in tb structure)
1297            and the maximal number of bytes that can flow to the right neighbor
1298            from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1299          */
1300         check_right(tb, h, rfree);
1301
1302         /* all contents of internal node S[h] can be moved into its
1303            neighbors, S[h] will be removed after balancing */
1304         if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1305                 int to_r;
1306
1307                 /* Since we are working on internal nodes, and our internal
1308                    nodes have fixed size entries, then we can balance by the
1309                    number of items rather than the space they consume.  In this
1310                    routine we set the left node equal to the right node,
1311                    allowing a difference of less than or equal to 1 child
1312                    pointer. */
1313                 to_r =
1314                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1315                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1316                                                 tb->rnum[h]);
1317                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1318                                -1, -1);
1319                 return CARRY_ON;
1320         }
1321
1322         /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1323         RFALSE(h &&
1324                (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1325                 tb->rnum[h] >= vn->vn_nr_item + 1),
1326                "vs-8220: tree is not balanced on internal level");
1327         RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1328                       (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1329                "vs-8225: tree is not balanced on leaf level");
1330
1331         /* all contents of S[0] can be moved into its neighbors
1332            S[0] will be removed after balancing. */
1333         if (!h && is_leaf_removable(tb))
1334                 return CARRY_ON;
1335
1336         /* why do we perform this check here rather than earlier??
1337            Answer: we can win 1 node in some cases above. Moreover we
1338            checked it above, when we checked, that S[0] is not removable
1339            in principle */
1340         if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1341                 if (!h)
1342                         tb->s0num = vn->vn_nr_item;
1343                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1344                 return NO_BALANCING_NEEDED;
1345         }
1346
1347         {
1348                 int lpar, rpar, nset, lset, rset, lrset;
1349                 /* 
1350                  * regular overflowing of the node
1351                  */
1352
1353                 /* get_num_ver works in 2 modes (FLOW & NO_FLOW) 
1354                    lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1355                    nset, lset, rset, lrset - shows, whether flowing items give better packing 
1356                  */
1357 #define FLOW 1
1358 #define NO_FLOW 0               /* do not any splitting */
1359
1360                 /* we choose one the following */
1361 #define NOTHING_SHIFT_NO_FLOW   0
1362 #define NOTHING_SHIFT_FLOW      5
1363 #define LEFT_SHIFT_NO_FLOW      10
1364 #define LEFT_SHIFT_FLOW         15
1365 #define RIGHT_SHIFT_NO_FLOW     20
1366 #define RIGHT_SHIFT_FLOW        25
1367 #define LR_SHIFT_NO_FLOW        30
1368 #define LR_SHIFT_FLOW           35
1369
1370                 lpar = tb->lnum[h];
1371                 rpar = tb->rnum[h];
1372
1373                 /* calculate number of blocks S[h] must be split into when
1374                    nothing is shifted to the neighbors,
1375                    as well as number of items in each part of the split node (s012 numbers),
1376                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1377                 nset = NOTHING_SHIFT_NO_FLOW;
1378                 nver = get_num_ver(vn->vn_mode, tb, h,
1379                                    0, -1, h ? vn->vn_nr_item : 0, -1,
1380                                    snum012, NO_FLOW);
1381
1382                 if (!h) {
1383                         int nver1;
1384
1385                         /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1386                         nver1 = get_num_ver(vn->vn_mode, tb, h,
1387                                             0, -1, 0, -1,
1388                                             snum012 + NOTHING_SHIFT_FLOW, FLOW);
1389                         if (nver > nver1)
1390                                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1391                 }
1392
1393                 /* calculate number of blocks S[h] must be split into when
1394                    l_shift_num first items and l_shift_bytes of the right most
1395                    liquid item to be shifted are shifted to the left neighbor,
1396                    as well as number of items in each part of the splitted node (s012 numbers),
1397                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1398                  */
1399                 lset = LEFT_SHIFT_NO_FLOW;
1400                 lnver = get_num_ver(vn->vn_mode, tb, h,
1401                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1402                                     -1, h ? vn->vn_nr_item : 0, -1,
1403                                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1404                 if (!h) {
1405                         int lnver1;
1406
1407                         lnver1 = get_num_ver(vn->vn_mode, tb, h,
1408                                              lpar -
1409                                              ((tb->lbytes != -1) ? 1 : 0),
1410                                              tb->lbytes, 0, -1,
1411                                              snum012 + LEFT_SHIFT_FLOW, FLOW);
1412                         if (lnver > lnver1)
1413                                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1414                 }
1415
1416                 /* calculate number of blocks S[h] must be split into when
1417                    r_shift_num first items and r_shift_bytes of the left most
1418                    liquid item to be shifted are shifted to the right neighbor,
1419                    as well as number of items in each part of the splitted node (s012 numbers),
1420                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1421                  */
1422                 rset = RIGHT_SHIFT_NO_FLOW;
1423                 rnver = get_num_ver(vn->vn_mode, tb, h,
1424                                     0, -1,
1425                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1426                                                                    ((tb->
1427                                                                      rbytes !=
1428                                                                      -1) ? 1 :
1429                                                                     0)), -1,
1430                                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1431                 if (!h) {
1432                         int rnver1;
1433
1434                         rnver1 = get_num_ver(vn->vn_mode, tb, h,
1435                                              0, -1,
1436                                              (rpar -
1437                                               ((tb->rbytes != -1) ? 1 : 0)),
1438                                              tb->rbytes,
1439                                              snum012 + RIGHT_SHIFT_FLOW, FLOW);
1440
1441                         if (rnver > rnver1)
1442                                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1443                 }
1444
1445                 /* calculate number of blocks S[h] must be split into when
1446                    items are shifted in both directions,
1447                    as well as number of items in each part of the splitted node (s012 numbers),
1448                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1449                  */
1450                 lrset = LR_SHIFT_NO_FLOW;
1451                 lrnver = get_num_ver(vn->vn_mode, tb, h,
1452                                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1453                                      -1,
1454                                      h ? (vn->vn_nr_item - rpar) : (rpar -
1455                                                                     ((tb->
1456                                                                       rbytes !=
1457                                                                       -1) ? 1 :
1458                                                                      0)), -1,
1459                                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1460                 if (!h) {
1461                         int lrnver1;
1462
1463                         lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1464                                               lpar -
1465                                               ((tb->lbytes != -1) ? 1 : 0),
1466                                               tb->lbytes,
1467                                               (rpar -
1468                                                ((tb->rbytes != -1) ? 1 : 0)),
1469                                               tb->rbytes,
1470                                               snum012 + LR_SHIFT_FLOW, FLOW);
1471                         if (lrnver > lrnver1)
1472                                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1473                 }
1474
1475                 /* Our general shifting strategy is:
1476                    1) to minimized number of new nodes;
1477                    2) to minimized number of neighbors involved in shifting;
1478                    3) to minimized number of disk reads; */
1479
1480                 /* we can win TWO or ONE nodes by shifting in both directions */
1481                 if (lrnver < lnver && lrnver < rnver) {
1482                         RFALSE(h &&
1483                                (tb->lnum[h] != 1 ||
1484                                 tb->rnum[h] != 1 ||
1485                                 lrnver != 1 || rnver != 2 || lnver != 2
1486                                 || h != 1), "vs-8230: bad h");
1487                         if (lrset == LR_SHIFT_FLOW)
1488                                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1489                                                lrnver, snum012 + lrset,
1490                                                tb->lbytes, tb->rbytes);
1491                         else
1492                                 set_parameters(tb, h,
1493                                                tb->lnum[h] -
1494                                                ((tb->lbytes == -1) ? 0 : 1),
1495                                                tb->rnum[h] -
1496                                                ((tb->rbytes == -1) ? 0 : 1),
1497                                                lrnver, snum012 + lrset, -1, -1);
1498
1499                         return CARRY_ON;
1500                 }
1501
1502                 /* if shifting doesn't lead to better packing then don't shift */
1503                 if (nver == lrnver) {
1504                         set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1505                                        -1);
1506                         return CARRY_ON;
1507                 }
1508
1509                 /* now we know that for better packing shifting in only one
1510                    direction either to the left or to the right is required */
1511
1512                 /*  if shifting to the left is better than shifting to the right */
1513                 if (lnver < rnver) {
1514                         SET_PAR_SHIFT_LEFT;
1515                         return CARRY_ON;
1516                 }
1517
1518                 /* if shifting to the right is better than shifting to the left */
1519                 if (lnver > rnver) {
1520                         SET_PAR_SHIFT_RIGHT;
1521                         return CARRY_ON;
1522                 }
1523
1524                 /* now shifting in either direction gives the same number
1525                    of nodes and we can make use of the cached neighbors */
1526                 if (is_left_neighbor_in_cache(tb, h)) {
1527                         SET_PAR_SHIFT_LEFT;
1528                         return CARRY_ON;
1529                 }
1530
1531                 /* shift to the right independently on whether the right neighbor in cache or not */
1532                 SET_PAR_SHIFT_RIGHT;
1533                 return CARRY_ON;
1534         }
1535 }
1536
1537 /* Check whether current node S[h] is balanced when Decreasing its size by
1538  * Deleting or Cutting for INTERNAL node of S+tree.
1539  * Calculate parameters for balancing for current level h.
1540  * Parameters:
1541  *      tb      tree_balance structure;
1542  *      h       current level of the node;
1543  *      inum    item number in S[h];
1544  *      mode    i - insert, p - paste;
1545  * Returns:     1 - schedule occurred; 
1546  *              0 - balancing for higher levels needed;
1547  *             -1 - no balancing for higher levels needed;
1548  *             -2 - no disk space.
1549  *
1550  * Note: Items of internal nodes have fixed size, so the balance condition for
1551  * the internal part of S+tree is as for the B-trees.
1552  */
1553 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1554 {
1555         struct virtual_node *vn = tb->tb_vn;
1556
1557         /* Sh is the node whose balance is currently being checked,
1558            and Fh is its father.  */
1559         struct buffer_head *Sh, *Fh;
1560         int maxsize, n_ret_value;
1561         int lfree, rfree /* free space in L and R */ ;
1562
1563         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1564         Fh = PATH_H_PPARENT(tb->tb_path, h);
1565
1566         maxsize = MAX_CHILD_SIZE(Sh);
1567
1568 /*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1569 /*   new_nr_item = number of items node would have if operation is */
1570 /*      performed without balancing (new_nr_item); */
1571         create_virtual_node(tb, h);
1572
1573         if (!Fh) {              /* S[h] is the root. */
1574                 if (vn->vn_nr_item > 0) {
1575                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1576                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1577                 }
1578                 /* new_nr_item == 0.
1579                  * Current root will be deleted resulting in
1580                  * decrementing the tree height. */
1581                 set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1582                 return CARRY_ON;
1583         }
1584
1585         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
1586                 return n_ret_value;
1587
1588         /* get free space of neighbors */
1589         rfree = get_rfree(tb, h);
1590         lfree = get_lfree(tb, h);
1591
1592         /* determine maximal number of items we can fit into neighbors */
1593         check_left(tb, h, lfree);
1594         check_right(tb, h, rfree);
1595
1596         if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
1597                                                  * In this case we balance only if it leads to better packing. */
1598                 if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
1599                                                          * which is impossible with greater values of new_nr_item. */
1600                         if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1601                                 /* All contents of S[h] can be moved to L[h]. */
1602                                 int n;
1603                                 int order_L;
1604
1605                                 order_L =
1606                                     ((n =
1607                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1608                                                           h)) ==
1609                                      0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1610                                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1611                                     (DC_SIZE + KEY_SIZE);
1612                                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1613                                                -1);
1614                                 return CARRY_ON;
1615                         }
1616
1617                         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1618                                 /* All contents of S[h] can be moved to R[h]. */
1619                                 int n;
1620                                 int order_R;
1621
1622                                 order_R =
1623                                     ((n =
1624                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1625                                                           h)) ==
1626                                      B_NR_ITEMS(Fh)) ? 0 : n + 1;
1627                                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1628                                     (DC_SIZE + KEY_SIZE);
1629                                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1630                                                -1);
1631                                 return CARRY_ON;
1632                         }
1633                 }
1634
1635                 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1636                         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1637                         int to_r;
1638
1639                         to_r =
1640                             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1641                              tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1642                             (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1643                         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1644                                        0, NULL, -1, -1);
1645                         return CARRY_ON;
1646                 }
1647
1648                 /* Balancing does not lead to better packing. */
1649                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1650                 return NO_BALANCING_NEEDED;
1651         }
1652
1653         /* Current node contain insufficient number of items. Balancing is required. */
1654         /* Check whether we can merge S[h] with left neighbor. */
1655         if (tb->lnum[h] >= vn->vn_nr_item + 1)
1656                 if (is_left_neighbor_in_cache(tb, h)
1657                     || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1658                         int n;
1659                         int order_L;
1660
1661                         order_L =
1662                             ((n =
1663                               PATH_H_B_ITEM_ORDER(tb->tb_path,
1664                                                   h)) ==
1665                              0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1666                         n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1667                                                                       KEY_SIZE);
1668                         set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1669                         return CARRY_ON;
1670                 }
1671
1672         /* Check whether we can merge S[h] with right neighbor. */
1673         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1674                 int n;
1675                 int order_R;
1676
1677                 order_R =
1678                     ((n =
1679                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1680                                           h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1681                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1682                                                               KEY_SIZE);
1683                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1684                 return CARRY_ON;
1685         }
1686
1687         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1688         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1689                 int to_r;
1690
1691                 to_r =
1692                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1693                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1694                                                 tb->rnum[h]);
1695                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1696                                -1, -1);
1697                 return CARRY_ON;
1698         }
1699
1700         /* For internal nodes try to borrow item from a neighbor */
1701         RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1702
1703         /* Borrow one or two items from caching neighbor */
1704         if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1705                 int from_l;
1706
1707                 from_l =
1708                     (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1709                      1) / 2 - (vn->vn_nr_item + 1);
1710                 set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1711                 return CARRY_ON;
1712         }
1713
1714         set_parameters(tb, h, 0,
1715                        -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1716                           1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1717         return CARRY_ON;
1718 }
1719
1720 /* Check whether current node S[h] is balanced when Decreasing its size by
1721  * Deleting or Truncating for LEAF node of S+tree.
1722  * Calculate parameters for balancing for current level h.
1723  * Parameters:
1724  *      tb      tree_balance structure;
1725  *      h       current level of the node;
1726  *      inum    item number in S[h];
1727  *      mode    i - insert, p - paste;
1728  * Returns:     1 - schedule occurred; 
1729  *              0 - balancing for higher levels needed;
1730  *             -1 - no balancing for higher levels needed;
1731  *             -2 - no disk space.
1732  */
1733 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1734 {
1735         struct virtual_node *vn = tb->tb_vn;
1736
1737         /* Number of bytes that must be deleted from
1738            (value is negative if bytes are deleted) buffer which
1739            contains node being balanced.  The mnemonic is that the
1740            attempted change in node space used level is levbytes bytes. */
1741         int levbytes;
1742         /* the maximal item size */
1743         int maxsize, n_ret_value;
1744         /* S0 is the node whose balance is currently being checked,
1745            and F0 is its father.  */
1746         struct buffer_head *S0, *F0;
1747         int lfree, rfree /* free space in L and R */ ;
1748
1749         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1750         F0 = PATH_H_PPARENT(tb->tb_path, 0);
1751
1752         levbytes = tb->insert_size[h];
1753
1754         maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1755
1756         if (!F0) {              /* S[0] is the root now. */
1757
1758                 RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1759                        "vs-8240: attempt to create empty buffer tree");
1760
1761                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1762                 return NO_BALANCING_NEEDED;
1763         }
1764
1765         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
1766                 return n_ret_value;
1767
1768         /* get free space of neighbors */
1769         rfree = get_rfree(tb, h);
1770         lfree = get_lfree(tb, h);
1771
1772         create_virtual_node(tb, h);
1773
1774         /* if 3 leaves can be merge to one, set parameters and return */
1775         if (are_leaves_removable(tb, lfree, rfree))
1776                 return CARRY_ON;
1777
1778         /* determine maximal number of items we can shift to the left/right  neighbor
1779            and the maximal number of bytes that can flow to the left/right neighbor
1780            from the left/right most liquid item that cannot be shifted from S[0] entirely
1781          */
1782         check_left(tb, h, lfree);
1783         check_right(tb, h, rfree);
1784
1785         /* check whether we can merge S with left neighbor. */
1786         if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1787                 if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1788                     !tb->FR[h]) {
1789
1790                         RFALSE(!tb->FL[h],
1791                                "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1792
1793                         /* set parameter to merge S[0] with its left neighbor */
1794                         set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1795                         return CARRY_ON;
1796                 }
1797
1798         /* check whether we can merge S[0] with right neighbor. */
1799         if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1800                 set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
1801                 return CARRY_ON;
1802         }
1803
1804         /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1805         if (is_leaf_removable(tb))
1806                 return CARRY_ON;
1807
1808         /* Balancing is not required. */
1809         tb->s0num = vn->vn_nr_item;
1810         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1811         return NO_BALANCING_NEEDED;
1812 }
1813
1814 /* Check whether current node S[h] is balanced when Decreasing its size by
1815  * Deleting or Cutting.
1816  * Calculate parameters for balancing for current level h.
1817  * Parameters:
1818  *      tb      tree_balance structure;
1819  *      h       current level of the node;
1820  *      inum    item number in S[h];
1821  *      mode    d - delete, c - cut.
1822  * Returns:     1 - schedule occurred; 
1823  *              0 - balancing for higher levels needed;
1824  *             -1 - no balancing for higher levels needed;
1825  *             -2 - no disk space.
1826  */
1827 static int dc_check_balance(struct tree_balance *tb, int h)
1828 {
1829         RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
1830                "vs-8250: S is not initialized");
1831
1832         if (h)
1833                 return dc_check_balance_internal(tb, h);
1834         else
1835                 return dc_check_balance_leaf(tb, h);
1836 }
1837
1838 /* Check whether current node S[h] is balanced.
1839  * Calculate parameters for balancing for current level h.
1840  * Parameters:
1841  *
1842  *      tb      tree_balance structure:
1843  *
1844  *              tb is a large structure that must be read about in the header file
1845  *              at the same time as this procedure if the reader is to successfully
1846  *              understand this procedure
1847  *
1848  *      h       current level of the node;
1849  *      inum    item number in S[h];
1850  *      mode    i - insert, p - paste, d - delete, c - cut.
1851  * Returns:     1 - schedule occurred; 
1852  *              0 - balancing for higher levels needed;
1853  *             -1 - no balancing for higher levels needed;
1854  *             -2 - no disk space.
1855  */
1856 static int check_balance(int mode,
1857                          struct tree_balance *tb,
1858                          int h,
1859                          int inum,
1860                          int pos_in_item,
1861                          struct item_head *ins_ih, const void *data)
1862 {
1863         struct virtual_node *vn;
1864
1865         vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1866         vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1867         vn->vn_mode = mode;
1868         vn->vn_affected_item_num = inum;
1869         vn->vn_pos_in_item = pos_in_item;
1870         vn->vn_ins_ih = ins_ih;
1871         vn->vn_data = data;
1872
1873         RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
1874                "vs-8255: ins_ih can not be 0 in insert mode");
1875
1876         if (tb->insert_size[h] > 0)
1877                 /* Calculate balance parameters when size of node is increasing. */
1878                 return ip_check_balance(tb, h);
1879
1880         /* Calculate balance parameters when  size of node is decreasing. */
1881         return dc_check_balance(tb, h);
1882 }
1883
1884 /* Check whether parent at the path is the really parent of the current node.*/
1885 static int get_direct_parent(struct tree_balance *p_s_tb, int n_h)
1886 {
1887         struct buffer_head *p_s_bh;
1888         struct treepath *p_s_path = p_s_tb->tb_path;
1889         int n_position,
1890             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1891
1892         /* We are in the root or in the new root. */
1893         if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1894
1895                 RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1896                        "PAP-8260: invalid offset in the path");
1897
1898                 if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->
1899                     b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
1900                         /* Root is not changed. */
1901                         PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1902                         PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1903                         return CARRY_ON;
1904                 }
1905                 return REPEAT_SEARCH;   /* Root is changed and we must recalculate the path. */
1906         }
1907
1908         if (!B_IS_IN_TREE
1909             (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)))
1910                 return REPEAT_SEARCH;   /* Parent in the path is not in the tree. */
1911
1912         if ((n_position =
1913              PATH_OFFSET_POSITION(p_s_path,
1914                                   n_path_offset - 1)) > B_NR_ITEMS(p_s_bh))
1915                 return REPEAT_SEARCH;
1916
1917         if (B_N_CHILD_NUM(p_s_bh, n_position) !=
1918             PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr)
1919                 /* Parent in the path is not parent of the current node in the tree. */
1920                 return REPEAT_SEARCH;
1921
1922         if (buffer_locked(p_s_bh)) {
1923                 __wait_on_buffer(p_s_bh);
1924                 if (FILESYSTEM_CHANGED_TB(p_s_tb))
1925                         return REPEAT_SEARCH;
1926         }
1927
1928         return CARRY_ON;        /* Parent in the path is unlocked and really parent of the current node.  */
1929 }
1930
1931 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1932  * of S[n_h] we
1933  * need in order to balance S[n_h], and get them if necessary.
1934  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1935  *              CARRY_ON - schedule didn't occur while the function worked;
1936  */
1937 static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
1938 {
1939         int n_child_position,
1940             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1941         unsigned long n_son_number;
1942         struct super_block *p_s_sb = p_s_tb->tb_sb;
1943         struct buffer_head *p_s_bh;
1944
1945         PROC_INFO_INC(p_s_sb, get_neighbors[n_h]);
1946
1947         if (p_s_tb->lnum[n_h]) {
1948                 /* We need left neighbor to balance S[n_h]. */
1949                 PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]);
1950                 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1951
1952                 RFALSE(p_s_bh == p_s_tb->FL[n_h] &&
1953                        !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1954                        "PAP-8270: invalid position in the parent");
1955
1956                 n_child_position =
1957                     (p_s_bh ==
1958                      p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->
1959                                                                        FL[n_h]);
1960                 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1961                 p_s_bh = sb_bread(p_s_sb, n_son_number);
1962                 if (!p_s_bh)
1963                         return IO_ERROR;
1964                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1965                         decrement_bcount(p_s_bh);
1966                         PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
1967                         return REPEAT_SEARCH;
1968                 }
1969
1970                 RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1971                        n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1972                        B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1973                        p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1974                 RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1975                 RFALSE(!n_h &&
1976                        B_FREE_SPACE(p_s_bh) !=
1977                        MAX_CHILD_SIZE(p_s_bh) -
1978                        dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)),
1979                        "PAP-8290: invalid child size of left neighbor");
1980
1981                 decrement_bcount(p_s_tb->L[n_h]);
1982                 p_s_tb->L[n_h] = p_s_bh;
1983         }
1984
1985         if (p_s_tb->rnum[n_h]) {        /* We need right neighbor to balance S[n_path_offset]. */
1986                 PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]);
1987                 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1988
1989                 RFALSE(p_s_bh == p_s_tb->FR[n_h] &&
1990                        PATH_OFFSET_POSITION(p_s_tb->tb_path,
1991                                             n_path_offset) >=
1992                        B_NR_ITEMS(p_s_bh),
1993                        "PAP-8295: invalid position in the parent");
1994
1995                 n_child_position =
1996                     (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0;
1997                 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
1998                 p_s_bh = sb_bread(p_s_sb, n_son_number);
1999                 if (!p_s_bh)
2000                         return IO_ERROR;
2001                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2002                         decrement_bcount(p_s_bh);
2003                         PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
2004                         return REPEAT_SEARCH;
2005                 }
2006                 decrement_bcount(p_s_tb->R[n_h]);
2007                 p_s_tb->R[n_h] = p_s_bh;
2008
2009                 RFALSE(!n_h
2010                        && B_FREE_SPACE(p_s_bh) !=
2011                        MAX_CHILD_SIZE(p_s_bh) -
2012                        dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)),
2013                        "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2014                        B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh),
2015                        dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)));
2016
2017         }
2018         return CARRY_ON;
2019 }
2020
2021 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2022 {
2023         int max_num_of_items;
2024         int max_num_of_entries;
2025         unsigned long blocksize = sb->s_blocksize;
2026
2027 #define MIN_NAME_LEN 1
2028
2029         max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2030         max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2031             (DEH_SIZE + MIN_NAME_LEN);
2032
2033         return sizeof(struct virtual_node) +
2034             max(max_num_of_items * sizeof(struct virtual_item),
2035                 sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2036                 (max_num_of_entries - 1) * sizeof(__u16));
2037 }
2038
2039 /* maybe we should fail balancing we are going to perform when kmalloc
2040    fails several times. But now it will loop until kmalloc gets
2041    required memory */
2042 static int get_mem_for_virtual_node(struct tree_balance *tb)
2043 {
2044         int check_fs = 0;
2045         int size;
2046         char *buf;
2047
2048         size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2049
2050         if (size > tb->vn_buf_size) {
2051                 /* we have to allocate more memory for virtual node */
2052                 if (tb->vn_buf) {
2053                         /* free memory allocated before */
2054                         kfree(tb->vn_buf);
2055                         /* this is not needed if kfree is atomic */
2056                         check_fs = 1;
2057                 }
2058
2059                 /* virtual node requires now more memory */
2060                 tb->vn_buf_size = size;
2061
2062                 /* get memory for virtual item */
2063                 buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2064                 if (!buf) {
2065                         /* getting memory with GFP_KERNEL priority may involve
2066                            balancing now (due to indirect_to_direct conversion on
2067                            dcache shrinking). So, release path and collected
2068                            resources here */
2069                         free_buffers_in_tb(tb);
2070                         buf = kmalloc(size, GFP_NOFS);
2071                         if (!buf) {
2072                                 tb->vn_buf_size = 0;
2073                         }
2074                         tb->vn_buf = buf;
2075                         schedule();
2076                         return REPEAT_SEARCH;
2077                 }
2078
2079                 tb->vn_buf = buf;
2080         }
2081
2082         if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2083                 return REPEAT_SEARCH;
2084
2085         return CARRY_ON;
2086 }
2087
2088 #ifdef CONFIG_REISERFS_CHECK
2089 static void tb_buffer_sanity_check(struct super_block *p_s_sb,
2090                                    struct buffer_head *p_s_bh,
2091                                    const char *descr, int level)
2092 {
2093         if (p_s_bh) {
2094                 if (atomic_read(&(p_s_bh->b_count)) <= 0) {
2095
2096                         reiserfs_panic(p_s_sb,
2097                                        "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
2098                                        descr, level, p_s_bh);
2099                 }
2100
2101                 if (!buffer_uptodate(p_s_bh)) {
2102                         reiserfs_panic(p_s_sb,
2103                                        "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
2104                                        descr, level, p_s_bh);
2105                 }
2106
2107                 if (!B_IS_IN_TREE(p_s_bh)) {
2108                         reiserfs_panic(p_s_sb,
2109                                        "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
2110                                        descr, level, p_s_bh);
2111                 }
2112
2113                 if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
2114                         reiserfs_panic(p_s_sb,
2115                                        "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
2116                                        descr, level, p_s_bh);
2117                 }
2118
2119                 if (p_s_bh->b_size != p_s_sb->s_blocksize) {
2120                         reiserfs_panic(p_s_sb,
2121                                        "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
2122                                        descr, level, p_s_bh);
2123                 }
2124
2125                 if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2126                         reiserfs_panic(p_s_sb,
2127                                        "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
2128                                        descr, level, p_s_bh);
2129                 }
2130         }
2131 }
2132 #else
2133 static void tb_buffer_sanity_check(struct super_block *p_s_sb,
2134                                    struct buffer_head *p_s_bh,
2135                                    const char *descr, int level)
2136 {;
2137 }
2138 #endif
2139
2140 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2141 {
2142         return reiserfs_prepare_for_journal(s, bh, 0);
2143 }
2144
2145 static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
2146 {
2147         struct buffer_head *locked;
2148 #ifdef CONFIG_REISERFS_CHECK
2149         int repeat_counter = 0;
2150 #endif
2151         int i;
2152
2153         do {
2154
2155                 locked = NULL;
2156
2157                 for (i = p_s_tb->tb_path->path_length;
2158                      !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2159                         if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2160                                 /* if I understand correctly, we can only be sure the last buffer
2161                                  ** in the path is in the tree --clm
2162                                  */
2163 #ifdef CONFIG_REISERFS_CHECK
2164                                 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2165                                     PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2166                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2167                                                                PATH_OFFSET_PBUFFER
2168                                                                (p_s_tb->tb_path,
2169                                                                 i), "S",
2170                                                                p_s_tb->tb_path->
2171                                                                path_length - i);
2172                                 }
2173 #endif
2174                                 if (!clear_all_dirty_bits(p_s_tb->tb_sb,
2175                                                           PATH_OFFSET_PBUFFER
2176                                                           (p_s_tb->tb_path,
2177                                                            i))) {
2178                                         locked =
2179                                             PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
2180                                                                 i);
2181                                 }
2182                         }
2183                 }
2184
2185                 for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
2186                      i++) {
2187
2188                         if (p_s_tb->lnum[i]) {
2189
2190                                 if (p_s_tb->L[i]) {
2191                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2192                                                                p_s_tb->L[i],
2193                                                                "L", i);
2194                                         if (!clear_all_dirty_bits
2195                                             (p_s_tb->tb_sb, p_s_tb->L[i]))
2196                                                 locked = p_s_tb->L[i];
2197                                 }
2198
2199                                 if (!locked && p_s_tb->FL[i]) {
2200                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2201                                                                p_s_tb->FL[i],
2202                                                                "FL", i);
2203                                         if (!clear_all_dirty_bits
2204                                             (p_s_tb->tb_sb, p_s_tb->FL[i]))
2205                                                 locked = p_s_tb->FL[i];
2206                                 }
2207
2208                                 if (!locked && p_s_tb->CFL[i]) {
2209                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2210                                                                p_s_tb->CFL[i],
2211                                                                "CFL", i);
2212                                         if (!clear_all_dirty_bits
2213                                             (p_s_tb->tb_sb, p_s_tb->CFL[i]))
2214                                                 locked = p_s_tb->CFL[i];
2215                                 }
2216
2217                         }
2218
2219                         if (!locked && (p_s_tb->rnum[i])) {
2220
2221                                 if (p_s_tb->R[i]) {
2222                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2223                                                                p_s_tb->R[i],
2224                                                                "R", i);
2225                                         if (!clear_all_dirty_bits
2226                                             (p_s_tb->tb_sb, p_s_tb->R[i]))
2227                                                 locked = p_s_tb->R[i];
2228                                 }
2229
2230                                 if (!locked && p_s_tb->FR[i]) {
2231                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2232                                                                p_s_tb->FR[i],
2233                                                                "FR", i);
2234                                         if (!clear_all_dirty_bits
2235                                             (p_s_tb->tb_sb, p_s_tb->FR[i]))
2236                                                 locked = p_s_tb->FR[i];
2237                                 }
2238
2239                                 if (!locked && p_s_tb->CFR[i]) {
2240                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2241                                                                p_s_tb->CFR[i],
2242                                                                "CFR", i);
2243                                         if (!clear_all_dirty_bits
2244                                             (p_s_tb->tb_sb, p_s_tb->CFR[i]))
2245                                                 locked = p_s_tb->CFR[i];
2246                                 }
2247                         }
2248                 }
2249                 /* as far as I can tell, this is not required.  The FEB list seems
2250                  ** to be full of newly allocated nodes, which will never be locked,
2251                  ** dirty, or anything else.
2252                  ** To be safe, I'm putting in the checks and waits in.  For the moment,
2253                  ** they are needed to keep the code in journal.c from complaining
2254                  ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
2255                  ** --clm
2256                  */
2257                 for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2258                         if (p_s_tb->FEB[i]) {
2259                                 if (!clear_all_dirty_bits
2260                                     (p_s_tb->tb_sb, p_s_tb->FEB[i]))
2261                                         locked = p_s_tb->FEB[i];
2262                         }
2263                 }
2264
2265                 if (locked) {
2266 #ifdef CONFIG_REISERFS_CHECK
2267                         repeat_counter++;
2268                         if ((repeat_counter % 10000) == 0) {
2269                                 reiserfs_warning(p_s_tb->tb_sb,
2270                                                  "wait_tb_buffers_until_released(): too many "
2271                                                  "iterations waiting for buffer to unlock "
2272                                                  "(%b)", locked);
2273
2274                                 /* Don't loop forever.  Try to recover from possible error. */
2275
2276                                 return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
2277                                     REPEAT_SEARCH : CARRY_ON;
2278                         }
2279 #endif
2280                         __wait_on_buffer(locked);
2281                         if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2282                                 return REPEAT_SEARCH;
2283                         }
2284                 }
2285
2286         } while (locked);
2287
2288         return CARRY_ON;
2289 }
2290
2291 /* Prepare for balancing, that is
2292  *      get all necessary parents, and neighbors;
2293  *      analyze what and where should be moved;
2294  *      get sufficient number of new nodes;
2295  * Balancing will start only after all resources will be collected at a time.
2296  * 
2297  * When ported to SMP kernels, only at the last moment after all needed nodes
2298  * are collected in cache, will the resources be locked using the usual
2299  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2300  * this code neither write locks what it does not need to write lock nor locks out of order
2301  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
2302  * 
2303  * fix is meant in the sense of render unchanging
2304  * 
2305  * Latency might be improved by first gathering a list of what buffers are needed
2306  * and then getting as many of them in parallel as possible? -Hans
2307  *
2308  * Parameters:
2309  *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2310  *      tb      tree_balance structure;
2311  *      inum    item number in S[h];
2312  *      pos_in_item - comment this if you can
2313  *      ins_ih & ins_sd are used when inserting
2314  * Returns:     1 - schedule occurred while the function worked;
2315  *              0 - schedule didn't occur while the function worked;
2316  *             -1 - if no_disk_space 
2317  */
2318
2319 int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih, // item head of item being inserted
2320               const void *data  // inserted item or data to be pasted
2321     )
2322 {
2323         int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2324         int n_pos_in_item;
2325
2326         /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2327          ** during wait_tb_buffers_run
2328          */
2329         int wait_tb_buffers_run = 0;
2330         struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
2331
2332         ++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;
2333
2334         n_pos_in_item = p_s_tb->tb_path->pos_in_item;
2335
2336         p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);
2337
2338         /* we prepare and log the super here so it will already be in the
2339          ** transaction when do_balance needs to change it.
2340          ** This way do_balance won't have to schedule when trying to prepare
2341          ** the super for logging
2342          */
2343         reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2344                                      SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
2345         journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2346                            SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
2347         if (FILESYSTEM_CHANGED_TB(p_s_tb))
2348                 return REPEAT_SEARCH;
2349
2350         /* if it possible in indirect_to_direct conversion */
2351         if (buffer_locked(p_s_tbS0)) {
2352                 __wait_on_buffer(p_s_tbS0);
2353                 if (FILESYSTEM_CHANGED_TB(p_s_tb))
2354                         return REPEAT_SEARCH;
2355         }
2356 #ifdef CONFIG_REISERFS_CHECK
2357         if (cur_tb) {
2358                 print_cur_tb("fix_nodes");
2359                 reiserfs_panic(p_s_tb->tb_sb,
2360                                "PAP-8305: fix_nodes:  there is pending do_balance");
2361         }
2362
2363         if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
2364                 reiserfs_panic(p_s_tb->tb_sb,
2365                                "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2366                                "at the beginning of fix_nodes or not in tree (mode %c)",
2367                                p_s_tbS0, p_s_tbS0, n_op_mode);
2368         }
2369
2370         /* Check parameters. */
2371         switch (n_op_mode) {
2372         case M_INSERT:
2373                 if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
2374                         reiserfs_panic(p_s_tb->tb_sb,
2375                                        "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2376                                        n_item_num, B_NR_ITEMS(p_s_tbS0));
2377                 break;
2378         case M_PASTE:
2379         case M_DELETE:
2380         case M_CUT:
2381                 if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
2382                         print_block(p_s_tbS0, 0, -1, -1);
2383                         reiserfs_panic(p_s_tb->tb_sb,
2384                                        "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
2385                                        n_item_num, n_op_mode,
2386                                        p_s_tb->insert_size[0]);
2387                 }
2388                 break;
2389         default:
2390                 reiserfs_panic(p_s_tb->tb_sb,
2391                                "PAP-8340: fix_nodes: Incorrect mode of operation");
2392         }
2393 #endif
2394
2395         if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
2396                 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2397                 return REPEAT_SEARCH;
2398
2399         /* Starting from the leaf level; for all levels n_h of the tree. */
2400         for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
2401                 if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
2402                         goto repeat;
2403                 }
2404
2405                 if ((n_ret_value =
2406                      check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
2407                                    n_pos_in_item, p_s_ins_ih,
2408                                    data)) != CARRY_ON) {
2409                         if (n_ret_value == NO_BALANCING_NEEDED) {
2410                                 /* No balancing for higher levels needed. */
2411                                 if ((n_ret_value =
2412                                      get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
2413                                         goto repeat;
2414                                 }
2415                                 if (n_h != MAX_HEIGHT - 1)
2416                                         p_s_tb->insert_size[n_h + 1] = 0;
2417                                 /* ok, analysis and resource gathering are complete */
2418                                 break;
2419                         }
2420                         goto repeat;
2421                 }
2422
2423                 if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
2424                         goto repeat;
2425                 }
2426
2427                 if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
2428                         goto repeat;    /* No disk space, or schedule occurred and
2429                                            analysis may be invalid and needs to be redone. */
2430                 }
2431
2432                 if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
2433                         /* We have a positive insert size but no nodes exist on this
2434                            level, this means that we are creating a new root. */
2435
2436                         RFALSE(p_s_tb->blknum[n_h] != 1,
2437                                "PAP-8350: creating new empty root");
2438
2439                         if (n_h < MAX_HEIGHT - 1)
2440                                 p_s_tb->insert_size[n_h + 1] = 0;
2441                 } else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
2442                         if (p_s_tb->blknum[n_h] > 1) {
2443                                 /* The tree needs to be grown, so this node S[n_h]
2444                                    which is the root node is split into two nodes,
2445                                    and a new node (S[n_h+1]) will be created to
2446                                    become the root node.  */
2447
2448                                 RFALSE(n_h == MAX_HEIGHT - 1,
2449                                        "PAP-8355: attempt to create too high of a tree");
2450
2451                                 p_s_tb->insert_size[n_h + 1] =
2452                                     (DC_SIZE +
2453                                      KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
2454                                     DC_SIZE;
2455                         } else if (n_h < MAX_HEIGHT - 1)
2456                                 p_s_tb->insert_size[n_h + 1] = 0;
2457                 } else
2458                         p_s_tb->insert_size[n_h + 1] =
2459                             (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
2460         }
2461
2462         if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
2463                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2464                         wait_tb_buffers_run = 1;
2465                         n_ret_value = REPEAT_SEARCH;
2466                         goto repeat;
2467                 } else {
2468                         return CARRY_ON;
2469                 }
2470         } else {
2471                 wait_tb_buffers_run = 1;
2472                 goto repeat;
2473         }
2474
2475       repeat:
2476         // fix_nodes was unable to perform its calculation due to
2477         // filesystem got changed under us, lack of free disk space or i/o
2478         // failure. If the first is the case - the search will be
2479         // repeated. For now - free all resources acquired so far except
2480         // for the new allocated nodes
2481         {
2482                 int i;
2483
2484                 /* Release path buffers. */
2485                 if (wait_tb_buffers_run) {
2486                         pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
2487                 } else {
2488                         pathrelse(p_s_tb->tb_path);
2489                 }
2490                 /* brelse all resources collected for balancing */
2491                 for (i = 0; i < MAX_HEIGHT; i++) {
2492                         if (wait_tb_buffers_run) {
2493                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2494                                                                  p_s_tb->L[i]);
2495                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2496                                                                  p_s_tb->R[i]);
2497                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2498                                                                  p_s_tb->FL[i]);
2499                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2500                                                                  p_s_tb->FR[i]);
2501                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2502                                                                  p_s_tb->
2503                                                                  CFL[i]);
2504                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2505                                                                  p_s_tb->
2506                                                                  CFR[i]);
2507                         }
2508
2509                         brelse(p_s_tb->L[i]);
2510                         p_s_tb->L[i] = NULL;
2511                         brelse(p_s_tb->R[i]);
2512                         p_s_tb->R[i] = NULL;
2513                         brelse(p_s_tb->FL[i]);
2514                         p_s_tb->FL[i] = NULL;
2515                         brelse(p_s_tb->FR[i]);
2516                         p_s_tb->FR[i] = NULL;
2517                         brelse(p_s_tb->CFL[i]);
2518                         p_s_tb->CFL[i] = NULL;
2519                         brelse(p_s_tb->CFR[i]);
2520                         p_s_tb->CFR[i] = NULL;
2521                 }
2522
2523                 if (wait_tb_buffers_run) {
2524                         for (i = 0; i < MAX_FEB_SIZE; i++) {
2525                                 if (p_s_tb->FEB[i]) {
2526                                         reiserfs_restore_prepared_buffer
2527                                             (p_s_tb->tb_sb, p_s_tb->FEB[i]);
2528                                 }
2529                         }
2530                 }
2531                 return n_ret_value;
2532         }
2533
2534 }
2535
2536 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2537    wanted to make lines shorter */
2538 void unfix_nodes(struct tree_balance *tb)
2539 {
2540         int i;
2541
2542         /* Release path buffers. */
2543         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2544
2545         /* brelse all resources collected for balancing */
2546         for (i = 0; i < MAX_HEIGHT; i++) {
2547                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2548                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2549                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2550                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2551                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2552                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2553
2554                 brelse(tb->L[i]);
2555                 brelse(tb->R[i]);
2556                 brelse(tb->FL[i]);
2557                 brelse(tb->FR[i]);
2558                 brelse(tb->CFL[i]);
2559                 brelse(tb->CFR[i]);
2560         }
2561
2562         /* deal with list of allocated (used and unused) nodes */
2563         for (i = 0; i < MAX_FEB_SIZE; i++) {
2564                 if (tb->FEB[i]) {
2565                         b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2566                         /* de-allocated block which was not used by balancing and
2567                            bforget about buffer for it */
2568                         brelse(tb->FEB[i]);
2569                         reiserfs_free_block(tb->transaction_handle, NULL,
2570                                             blocknr, 0);
2571                 }
2572                 if (tb->used[i]) {
2573                         /* release used as new nodes including a new root */
2574                         brelse(tb->used[i]);
2575                 }
2576         }
2577
2578         kfree(tb->vn_buf);
2579
2580 }