Merge git://git.kernel.org/pub/scm/linux/kernel/git/bart/ide-2.6
[linux-2.6] / fs / reiserfs / fix_node.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /**
6  ** old_item_num
7  ** old_entry_num
8  ** set_entry_sizes
9  ** create_virtual_node
10  ** check_left
11  ** check_right
12  ** directory_part_size
13  ** get_num_ver
14  ** set_parameters
15  ** is_leaf_removable
16  ** are_leaves_removable
17  ** get_empty_nodes
18  ** get_lfree
19  ** get_rfree
20  ** is_left_neighbor_in_cache
21  ** decrement_key
22  ** get_far_parent
23  ** get_parents
24  ** can_node_be_removed
25  ** ip_check_balance
26  ** dc_check_balance_internal
27  ** dc_check_balance_leaf
28  ** dc_check_balance
29  ** check_balance
30  ** get_direct_parent
31  ** get_neighbors
32  ** fix_nodes
33  **
34  **
35  **/
36
37 #include <linux/time.h>
38 #include <linux/string.h>
39 #include <linux/reiserfs_fs.h>
40 #include <linux/buffer_head.h>
41
42 /* To make any changes in the tree we find a node, that contains item
43    to be changed/deleted or position in the node we insert a new item
44    to. We call this node S. To do balancing we need to decide what we
45    will shift to left/right neighbor, or to a new node, where new item
46    will be etc. To make this analysis simpler we build virtual
47    node. Virtual node is an array of items, that will replace items of
48    node S. (For instance if we are going to delete an item, virtual
49    node does not contain it). Virtual node keeps information about
50    item sizes and types, mergeability of first and last items, sizes
51    of all entries in directory item. We use this array of items when
52    calculating what we can shift to neighbors and how many nodes we
53    have to have if we do not any shiftings, if we shift to left/right
54    neighbor or to both. */
55
56 /* taking item number in virtual node, returns number of item, that it has in source buffer */
57 static inline int old_item_num(int new_num, int affected_item_num, int mode)
58 {
59         if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
60                 return new_num;
61
62         if (mode == M_INSERT) {
63
64                 RFALSE(new_num == 0,
65                        "vs-8005: for INSERT mode and item number of inserted item");
66
67                 return new_num - 1;
68         }
69
70         RFALSE(mode != M_DELETE,
71                "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
72                mode);
73         /* delete mode */
74         return new_num + 1;
75 }
76
77 static void create_virtual_node(struct tree_balance *tb, int h)
78 {
79         struct item_head *ih;
80         struct virtual_node *vn = tb->tb_vn;
81         int new_num;
82         struct buffer_head *Sh; /* this comes from tb->S[h] */
83
84         Sh = PATH_H_PBUFFER(tb->tb_path, h);
85
86         /* size of changed node */
87         vn->vn_size =
88             MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
89
90         /* for internal nodes array if virtual items is not created */
91         if (h) {
92                 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
93                 return;
94         }
95
96         /* number of items in virtual node  */
97         vn->vn_nr_item =
98             B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
99             ((vn->vn_mode == M_DELETE) ? 1 : 0);
100
101         /* first virtual item */
102         vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103         memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
104         vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
105
106         /* first item in the node */
107         ih = B_N_PITEM_HEAD(Sh, 0);
108
109         /* define the mergeability for 0-th item (if it is not being deleted) */
110         if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
111             && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112                 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
113
114         /* go through all items those remain in the virtual node (except for the new (inserted) one) */
115         for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
116                 int j;
117                 struct virtual_item *vi = vn->vn_vi + new_num;
118                 int is_affected =
119                     ((new_num != vn->vn_affected_item_num) ? 0 : 1);
120
121                 if (is_affected && vn->vn_mode == M_INSERT)
122                         continue;
123
124                 /* get item number in source node */
125                 j = old_item_num(new_num, vn->vn_affected_item_num,
126                                  vn->vn_mode);
127
128                 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
129                 vi->vi_ih = ih + j;
130                 vi->vi_item = B_I_PITEM(Sh, ih + j);
131                 vi->vi_uarea = vn->vn_free_ptr;
132
133                 // FIXME: there is no check, that item operation did not
134                 // consume too much memory
135                 vn->vn_free_ptr +=
136                     op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
137                 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
138                         reiserfs_panic(tb->tb_sb, "vs-8030",
139                                        "virtual node space consumed");
140
141                 if (!is_affected)
142                         /* this is not being changed */
143                         continue;
144
145                 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
146                         vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
147                         vi->vi_new_data = vn->vn_data;  // pointer to data which is going to be pasted
148                 }
149         }
150
151         /* virtual inserted item is not defined yet */
152         if (vn->vn_mode == M_INSERT) {
153                 struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
154
155                 RFALSE(vn->vn_ins_ih == NULL,
156                        "vs-8040: item header of inserted item is not specified");
157                 vi->vi_item_len = tb->insert_size[0];
158                 vi->vi_ih = vn->vn_ins_ih;
159                 vi->vi_item = vn->vn_data;
160                 vi->vi_uarea = vn->vn_free_ptr;
161
162                 op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
163                              tb->insert_size[0]);
164         }
165
166         /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
167         if (tb->CFR[0]) {
168                 struct reiserfs_key *key;
169
170                 key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
171                 if (op_is_left_mergeable(key, Sh->b_size)
172                     && (vn->vn_mode != M_DELETE
173                         || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
174                         vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
175                             VI_TYPE_RIGHT_MERGEABLE;
176
177 #ifdef CONFIG_REISERFS_CHECK
178                 if (op_is_left_mergeable(key, Sh->b_size) &&
179                     !(vn->vn_mode != M_DELETE
180                       || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
181                         /* we delete last item and it could be merged with right neighbor's first item */
182                         if (!
183                             (B_NR_ITEMS(Sh) == 1
184                              && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
185                              && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
186                                 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
187                                 print_block(Sh, 0, -1, -1);
188                                 reiserfs_panic(tb->tb_sb, "vs-8045",
189                                                "rdkey %k, affected item==%d "
190                                                "(mode==%c) Must be %c",
191                                                key, vn->vn_affected_item_num,
192                                                vn->vn_mode, M_DELETE);
193                         }
194                 }
195 #endif
196
197         }
198 }
199
200 /* using virtual node check, how many items can be shifted to left
201    neighbor */
202 static void check_left(struct tree_balance *tb, int h, int cur_free)
203 {
204         int i;
205         struct virtual_node *vn = tb->tb_vn;
206         struct virtual_item *vi;
207         int d_size, ih_size;
208
209         RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
210
211         /* internal level */
212         if (h > 0) {
213                 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
214                 return;
215         }
216
217         /* leaf level */
218
219         if (!cur_free || !vn->vn_nr_item) {
220                 /* no free space or nothing to move */
221                 tb->lnum[h] = 0;
222                 tb->lbytes = -1;
223                 return;
224         }
225
226         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
227                "vs-8055: parent does not exist or invalid");
228
229         vi = vn->vn_vi;
230         if ((unsigned int)cur_free >=
231             (vn->vn_size -
232              ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
233                 /* all contents of S[0] fits into L[0] */
234
235                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
236                        "vs-8055: invalid mode or balance condition failed");
237
238                 tb->lnum[0] = vn->vn_nr_item;
239                 tb->lbytes = -1;
240                 return;
241         }
242
243         d_size = 0, ih_size = IH_SIZE;
244
245         /* first item may be merge with last item in left neighbor */
246         if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
247                 d_size = -((int)IH_SIZE), ih_size = 0;
248
249         tb->lnum[0] = 0;
250         for (i = 0; i < vn->vn_nr_item;
251              i++, ih_size = IH_SIZE, d_size = 0, vi++) {
252                 d_size += vi->vi_item_len;
253                 if (cur_free >= d_size) {
254                         /* the item can be shifted entirely */
255                         cur_free -= d_size;
256                         tb->lnum[0]++;
257                         continue;
258                 }
259
260                 /* the item cannot be shifted entirely, try to split it */
261                 /* check whether L[0] can hold ih and at least one byte of the item body */
262                 if (cur_free <= ih_size) {
263                         /* cannot shift even a part of the current item */
264                         tb->lbytes = -1;
265                         return;
266                 }
267                 cur_free -= ih_size;
268
269                 tb->lbytes = op_check_left(vi, cur_free, 0, 0);
270                 if (tb->lbytes != -1)
271                         /* count partially shifted item */
272                         tb->lnum[0]++;
273
274                 break;
275         }
276
277         return;
278 }
279
280 /* using virtual node check, how many items can be shifted to right
281    neighbor */
282 static void check_right(struct tree_balance *tb, int h, int cur_free)
283 {
284         int i;
285         struct virtual_node *vn = tb->tb_vn;
286         struct virtual_item *vi;
287         int d_size, ih_size;
288
289         RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
290
291         /* internal level */
292         if (h > 0) {
293                 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
294                 return;
295         }
296
297         /* leaf level */
298
299         if (!cur_free || !vn->vn_nr_item) {
300                 /* no free space  */
301                 tb->rnum[h] = 0;
302                 tb->rbytes = -1;
303                 return;
304         }
305
306         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
307                "vs-8075: parent does not exist or invalid");
308
309         vi = vn->vn_vi + vn->vn_nr_item - 1;
310         if ((unsigned int)cur_free >=
311             (vn->vn_size -
312              ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
313                 /* all contents of S[0] fits into R[0] */
314
315                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
316                        "vs-8080: invalid mode or balance condition failed");
317
318                 tb->rnum[h] = vn->vn_nr_item;
319                 tb->rbytes = -1;
320                 return;
321         }
322
323         d_size = 0, ih_size = IH_SIZE;
324
325         /* last item may be merge with first item in right neighbor */
326         if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
327                 d_size = -(int)IH_SIZE, ih_size = 0;
328
329         tb->rnum[0] = 0;
330         for (i = vn->vn_nr_item - 1; i >= 0;
331              i--, d_size = 0, ih_size = IH_SIZE, vi--) {
332                 d_size += vi->vi_item_len;
333                 if (cur_free >= d_size) {
334                         /* the item can be shifted entirely */
335                         cur_free -= d_size;
336                         tb->rnum[0]++;
337                         continue;
338                 }
339
340                 /* check whether R[0] can hold ih and at least one byte of the item body */
341                 if (cur_free <= ih_size) {      /* cannot shift even a part of the current item */
342                         tb->rbytes = -1;
343                         return;
344                 }
345
346                 /* R[0] can hold the header of the item and at least one byte of its body */
347                 cur_free -= ih_size;    /* cur_free is still > 0 */
348
349                 tb->rbytes = op_check_right(vi, cur_free);
350                 if (tb->rbytes != -1)
351                         /* count partially shifted item */
352                         tb->rnum[0]++;
353
354                 break;
355         }
356
357         return;
358 }
359
360 /*
361  * from - number of items, which are shifted to left neighbor entirely
362  * to - number of item, which are shifted to right neighbor entirely
363  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
364  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
365 static int get_num_ver(int mode, struct tree_balance *tb, int h,
366                        int from, int from_bytes,
367                        int to, int to_bytes, short *snum012, int flow)
368 {
369         int i;
370         int cur_free;
371         //    int bytes;
372         int units;
373         struct virtual_node *vn = tb->tb_vn;
374         //    struct virtual_item * vi;
375
376         int total_node_size, max_node_size, current_item_size;
377         int needed_nodes;
378         int start_item,         /* position of item we start filling node from */
379          end_item,              /* position of item we finish filling node by */
380          start_bytes,           /* number of first bytes (entries for directory) of start_item-th item
381                                    we do not include into node that is being filled */
382          end_bytes;             /* number of last bytes (entries for directory) of end_item-th item
383                                    we do node include into node that is being filled */
384         int split_item_positions[2];    /* these are positions in virtual item of
385                                            items, that are split between S[0] and
386                                            S1new and S1new and S2new */
387
388         split_item_positions[0] = -1;
389         split_item_positions[1] = -1;
390
391         /* We only create additional nodes if we are in insert or paste mode
392            or we are in replace mode at the internal level. If h is 0 and
393            the mode is M_REPLACE then in fix_nodes we change the mode to
394            paste or insert before we get here in the code.  */
395         RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
396                "vs-8100: insert_size < 0 in overflow");
397
398         max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
399
400         /* snum012 [0-2] - number of items, that lay
401            to S[0], first new node and second new node */
402         snum012[3] = -1;        /* s1bytes */
403         snum012[4] = -1;        /* s2bytes */
404
405         /* internal level */
406         if (h > 0) {
407                 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
408                 if (i == max_node_size)
409                         return 1;
410                 return (i / max_node_size + 1);
411         }
412
413         /* leaf level */
414         needed_nodes = 1;
415         total_node_size = 0;
416         cur_free = max_node_size;
417
418         // start from 'from'-th item
419         start_item = from;
420         // skip its first 'start_bytes' units
421         start_bytes = ((from_bytes != -1) ? from_bytes : 0);
422
423         // last included item is the 'end_item'-th one
424         end_item = vn->vn_nr_item - to - 1;
425         // do not count last 'end_bytes' units of 'end_item'-th item
426         end_bytes = (to_bytes != -1) ? to_bytes : 0;
427
428         /* go through all item beginning from the start_item-th item and ending by
429            the end_item-th item. Do not count first 'start_bytes' units of
430            'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
431
432         for (i = start_item; i <= end_item; i++) {
433                 struct virtual_item *vi = vn->vn_vi + i;
434                 int skip_from_end = ((i == end_item) ? end_bytes : 0);
435
436                 RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
437
438                 /* get size of current item */
439                 current_item_size = vi->vi_item_len;
440
441                 /* do not take in calculation head part (from_bytes) of from-th item */
442                 current_item_size -=
443                     op_part_size(vi, 0 /*from start */ , start_bytes);
444
445                 /* do not take in calculation tail part of last item */
446                 current_item_size -=
447                     op_part_size(vi, 1 /*from end */ , skip_from_end);
448
449                 /* if item fits into current node entierly */
450                 if (total_node_size + current_item_size <= max_node_size) {
451                         snum012[needed_nodes - 1]++;
452                         total_node_size += current_item_size;
453                         start_bytes = 0;
454                         continue;
455                 }
456
457                 if (current_item_size > max_node_size) {
458                         /* virtual item length is longer, than max size of item in
459                            a node. It is impossible for direct item */
460                         RFALSE(is_direct_le_ih(vi->vi_ih),
461                                "vs-8110: "
462                                "direct item length is %d. It can not be longer than %d",
463                                current_item_size, max_node_size);
464                         /* we will try to split it */
465                         flow = 1;
466                 }
467
468                 if (!flow) {
469                         /* as we do not split items, take new node and continue */
470                         needed_nodes++;
471                         i--;
472                         total_node_size = 0;
473                         continue;
474                 }
475                 // calculate number of item units which fit into node being
476                 // filled
477                 {
478                         int free_space;
479
480                         free_space = max_node_size - total_node_size - IH_SIZE;
481                         units =
482                             op_check_left(vi, free_space, start_bytes,
483                                           skip_from_end);
484                         if (units == -1) {
485                                 /* nothing fits into current node, take new node and continue */
486                                 needed_nodes++, i--, total_node_size = 0;
487                                 continue;
488                         }
489                 }
490
491                 /* something fits into the current node */
492                 //if (snum012[3] != -1 || needed_nodes != 1)
493                 //  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
494                 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
495                 start_bytes += units;
496                 snum012[needed_nodes - 1 + 3] = units;
497
498                 if (needed_nodes > 2)
499                         reiserfs_warning(tb->tb_sb, "vs-8111",
500                                          "split_item_position is out of range");
501                 snum012[needed_nodes - 1]++;
502                 split_item_positions[needed_nodes - 1] = i;
503                 needed_nodes++;
504                 /* continue from the same item with start_bytes != -1 */
505                 start_item = i;
506                 i--;
507                 total_node_size = 0;
508         }
509
510         // sum012[4] (if it is not -1) contains number of units of which
511         // are to be in S1new, snum012[3] - to be in S0. They are supposed
512         // to be S1bytes and S2bytes correspondingly, so recalculate
513         if (snum012[4] > 0) {
514                 int split_item_num;
515                 int bytes_to_r, bytes_to_l;
516                 int bytes_to_S1new;
517
518                 split_item_num = split_item_positions[1];
519                 bytes_to_l =
520                     ((from == split_item_num
521                       && from_bytes != -1) ? from_bytes : 0);
522                 bytes_to_r =
523                     ((end_item == split_item_num
524                       && end_bytes != -1) ? end_bytes : 0);
525                 bytes_to_S1new =
526                     ((split_item_positions[0] ==
527                       split_item_positions[1]) ? snum012[3] : 0);
528
529                 // s2bytes
530                 snum012[4] =
531                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
532                     bytes_to_r - bytes_to_l - bytes_to_S1new;
533
534                 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
535                     vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
536                         reiserfs_warning(tb->tb_sb, "vs-8115",
537                                          "not directory or indirect item");
538         }
539
540         /* now we know S2bytes, calculate S1bytes */
541         if (snum012[3] > 0) {
542                 int split_item_num;
543                 int bytes_to_r, bytes_to_l;
544                 int bytes_to_S2new;
545
546                 split_item_num = split_item_positions[0];
547                 bytes_to_l =
548                     ((from == split_item_num
549                       && from_bytes != -1) ? from_bytes : 0);
550                 bytes_to_r =
551                     ((end_item == split_item_num
552                       && end_bytes != -1) ? end_bytes : 0);
553                 bytes_to_S2new =
554                     ((split_item_positions[0] == split_item_positions[1]
555                       && snum012[4] != -1) ? snum012[4] : 0);
556
557                 // s1bytes
558                 snum012[3] =
559                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
560                     bytes_to_r - bytes_to_l - bytes_to_S2new;
561         }
562
563         return needed_nodes;
564 }
565
566 #ifdef CONFIG_REISERFS_CHECK
567 extern struct tree_balance *cur_tb;
568 #endif
569
570 /* Set parameters for balancing.
571  * Performs write of results of analysis of balancing into structure tb,
572  * where it will later be used by the functions that actually do the balancing.
573  * Parameters:
574  *      tb      tree_balance structure;
575  *      h       current level of the node;
576  *      lnum    number of items from S[h] that must be shifted to L[h];
577  *      rnum    number of items from S[h] that must be shifted to R[h];
578  *      blk_num number of blocks that S[h] will be splitted into;
579  *      s012    number of items that fall into splitted nodes.
580  *      lbytes  number of bytes which flow to the left neighbor from the item that is not
581  *              not shifted entirely
582  *      rbytes  number of bytes which flow to the right neighbor from the item that is not
583  *              not shifted entirely
584  *      s1bytes number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
585  */
586
587 static void set_parameters(struct tree_balance *tb, int h, int lnum,
588                            int rnum, int blk_num, short *s012, int lb, int rb)
589 {
590
591         tb->lnum[h] = lnum;
592         tb->rnum[h] = rnum;
593         tb->blknum[h] = blk_num;
594
595         if (h == 0) {           /* only for leaf level */
596                 if (s012 != NULL) {
597                         tb->s0num = *s012++,
598                             tb->s1num = *s012++, tb->s2num = *s012++;
599                         tb->s1bytes = *s012++;
600                         tb->s2bytes = *s012;
601                 }
602                 tb->lbytes = lb;
603                 tb->rbytes = rb;
604         }
605         PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
606         PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
607
608         PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
609         PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
610 }
611
612 /* check, does node disappear if we shift tb->lnum[0] items to left
613    neighbor and tb->rnum[0] to the right one. */
614 static int is_leaf_removable(struct tree_balance *tb)
615 {
616         struct virtual_node *vn = tb->tb_vn;
617         int to_left, to_right;
618         int size;
619         int remain_items;
620
621         /* number of items, that will be shifted to left (right) neighbor
622            entirely */
623         to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
624         to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
625         remain_items = vn->vn_nr_item;
626
627         /* how many items remain in S[0] after shiftings to neighbors */
628         remain_items -= (to_left + to_right);
629
630         if (remain_items < 1) {
631                 /* all content of node can be shifted to neighbors */
632                 set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
633                                NULL, -1, -1);
634                 return 1;
635         }
636
637         if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
638                 /* S[0] is not removable */
639                 return 0;
640
641         /* check, whether we can divide 1 remaining item between neighbors */
642
643         /* get size of remaining item (in item units) */
644         size = op_unit_num(&(vn->vn_vi[to_left]));
645
646         if (tb->lbytes + tb->rbytes >= size) {
647                 set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
648                                tb->lbytes, -1);
649                 return 1;
650         }
651
652         return 0;
653 }
654
655 /* check whether L, S, R can be joined in one node */
656 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
657 {
658         struct virtual_node *vn = tb->tb_vn;
659         int ih_size;
660         struct buffer_head *S0;
661
662         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
663
664         ih_size = 0;
665         if (vn->vn_nr_item) {
666                 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
667                         ih_size += IH_SIZE;
668
669                 if (vn->vn_vi[vn->vn_nr_item - 1].
670                     vi_type & VI_TYPE_RIGHT_MERGEABLE)
671                         ih_size += IH_SIZE;
672         } else {
673                 /* there was only one item and it will be deleted */
674                 struct item_head *ih;
675
676                 RFALSE(B_NR_ITEMS(S0) != 1,
677                        "vs-8125: item number must be 1: it is %d",
678                        B_NR_ITEMS(S0));
679
680                 ih = B_N_PITEM_HEAD(S0, 0);
681                 if (tb->CFR[0]
682                     && !comp_short_le_keys(&(ih->ih_key),
683                                            B_N_PDELIM_KEY(tb->CFR[0],
684                                                           tb->rkey[0])))
685                         if (is_direntry_le_ih(ih)) {
686                                 /* Directory must be in correct state here: that is
687                                    somewhere at the left side should exist first directory
688                                    item. But the item being deleted can not be that first
689                                    one because its right neighbor is item of the same
690                                    directory. (But first item always gets deleted in last
691                                    turn). So, neighbors of deleted item can be merged, so
692                                    we can save ih_size */
693                                 ih_size = IH_SIZE;
694
695                                 /* we might check that left neighbor exists and is of the
696                                    same directory */
697                                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
698                                        "vs-8130: first directory item can not be removed until directory is not empty");
699                         }
700
701         }
702
703         if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
704                 set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
705                 PROC_INFO_INC(tb->tb_sb, leaves_removable);
706                 return 1;
707         }
708         return 0;
709
710 }
711
712 /* when we do not split item, lnum and rnum are numbers of entire items */
713 #define SET_PAR_SHIFT_LEFT \
714 if (h)\
715 {\
716    int to_l;\
717    \
718    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
719               (MAX_NR_KEY(Sh) + 1 - lpar);\
720               \
721               set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
722 }\
723 else \
724 {\
725    if (lset==LEFT_SHIFT_FLOW)\
726      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
727                      tb->lbytes, -1);\
728    else\
729      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
730                      -1, -1);\
731 }
732
733 #define SET_PAR_SHIFT_RIGHT \
734 if (h)\
735 {\
736    int to_r;\
737    \
738    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
739    \
740    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
741 }\
742 else \
743 {\
744    if (rset==RIGHT_SHIFT_FLOW)\
745      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
746                   -1, tb->rbytes);\
747    else\
748      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
749                   -1, -1);\
750 }
751
752 static void free_buffers_in_tb(struct tree_balance *tb)
753 {
754         int i;
755
756         pathrelse(tb->tb_path);
757
758         for (i = 0; i < MAX_HEIGHT; i++) {
759                 brelse(tb->L[i]);
760                 brelse(tb->R[i]);
761                 brelse(tb->FL[i]);
762                 brelse(tb->FR[i]);
763                 brelse(tb->CFL[i]);
764                 brelse(tb->CFR[i]);
765
766                 tb->L[i] = NULL;
767                 tb->R[i] = NULL;
768                 tb->FL[i] = NULL;
769                 tb->FR[i] = NULL;
770                 tb->CFL[i] = NULL;
771                 tb->CFR[i] = NULL;
772         }
773 }
774
775 /* Get new buffers for storing new nodes that are created while balancing.
776  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
777  *              CARRY_ON - schedule didn't occur while the function worked;
778  *              NO_DISK_SPACE - no disk space.
779  */
780 /* The function is NOT SCHEDULE-SAFE! */
781 static int get_empty_nodes(struct tree_balance *tb, int h)
782 {
783         struct buffer_head *new_bh,
784             *Sh = PATH_H_PBUFFER(tb->tb_path, h);
785         b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
786         int counter, number_of_freeblk, amount_needed,  /* number of needed empty blocks */
787          retval = CARRY_ON;
788         struct super_block *sb = tb->tb_sb;
789
790         /* number_of_freeblk is the number of empty blocks which have been
791            acquired for use by the balancing algorithm minus the number of
792            empty blocks used in the previous levels of the analysis,
793            number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
794            after empty blocks are acquired, and the balancing analysis is
795            then restarted, amount_needed is the number needed by this level
796            (h) of the balancing analysis.
797
798            Note that for systems with many processes writing, it would be
799            more layout optimal to calculate the total number needed by all
800            levels and then to run reiserfs_new_blocks to get all of them at once.  */
801
802         /* Initiate number_of_freeblk to the amount acquired prior to the restart of
803            the analysis or 0 if not restarted, then subtract the amount needed
804            by all of the levels of the tree below h. */
805         /* blknum includes S[h], so we subtract 1 in this calculation */
806         for (counter = 0, number_of_freeblk = tb->cur_blknum;
807              counter < h; counter++)
808                 number_of_freeblk -=
809                     (tb->blknum[counter]) ? (tb->blknum[counter] -
810                                                    1) : 0;
811
812         /* Allocate missing empty blocks. */
813         /* if Sh == 0  then we are getting a new root */
814         amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
815         /*  Amount_needed = the amount that we need more than the amount that we have. */
816         if (amount_needed > number_of_freeblk)
817                 amount_needed -= number_of_freeblk;
818         else                    /* If we have enough already then there is nothing to do. */
819                 return CARRY_ON;
820
821         /* No need to check quota - is not allocated for blocks used for formatted nodes */
822         if (reiserfs_new_form_blocknrs(tb, blocknrs,
823                                        amount_needed) == NO_DISK_SPACE)
824                 return NO_DISK_SPACE;
825
826         /* for each blocknumber we just got, get a buffer and stick it on FEB */
827         for (blocknr = blocknrs, counter = 0;
828              counter < amount_needed; blocknr++, counter++) {
829
830                 RFALSE(!*blocknr,
831                        "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
832
833                 new_bh = sb_getblk(sb, *blocknr);
834                 RFALSE(buffer_dirty(new_bh) ||
835                        buffer_journaled(new_bh) ||
836                        buffer_journal_dirty(new_bh),
837                        "PAP-8140: journlaled or dirty buffer %b for the new block",
838                        new_bh);
839
840                 /* Put empty buffers into the array. */
841                 RFALSE(tb->FEB[tb->cur_blknum],
842                        "PAP-8141: busy slot for new buffer");
843
844                 set_buffer_journal_new(new_bh);
845                 tb->FEB[tb->cur_blknum++] = new_bh;
846         }
847
848         if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
849                 retval = REPEAT_SEARCH;
850
851         return retval;
852 }
853
854 /* Get free space of the left neighbor, which is stored in the parent
855  * node of the left neighbor.  */
856 static int get_lfree(struct tree_balance *tb, int h)
857 {
858         struct buffer_head *l, *f;
859         int order;
860
861         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
862             (l = tb->FL[h]) == NULL)
863                 return 0;
864
865         if (f == l)
866                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
867         else {
868                 order = B_NR_ITEMS(l);
869                 f = l;
870         }
871
872         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
873 }
874
875 /* Get free space of the right neighbor,
876  * which is stored in the parent node of the right neighbor.
877  */
878 static int get_rfree(struct tree_balance *tb, int h)
879 {
880         struct buffer_head *r, *f;
881         int order;
882
883         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
884             (r = tb->FR[h]) == NULL)
885                 return 0;
886
887         if (f == r)
888                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
889         else {
890                 order = 0;
891                 f = r;
892         }
893
894         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
895
896 }
897
898 /* Check whether left neighbor is in memory. */
899 static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
900 {
901         struct buffer_head *father, *left;
902         struct super_block *sb = tb->tb_sb;
903         b_blocknr_t left_neighbor_blocknr;
904         int left_neighbor_position;
905
906         /* Father of the left neighbor does not exist. */
907         if (!tb->FL[h])
908                 return 0;
909
910         /* Calculate father of the node to be balanced. */
911         father = PATH_H_PBUFFER(tb->tb_path, h + 1);
912
913         RFALSE(!father ||
914                !B_IS_IN_TREE(father) ||
915                !B_IS_IN_TREE(tb->FL[h]) ||
916                !buffer_uptodate(father) ||
917                !buffer_uptodate(tb->FL[h]),
918                "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
919                father, tb->FL[h]);
920
921         /* Get position of the pointer to the left neighbor into the left father. */
922         left_neighbor_position = (father == tb->FL[h]) ?
923             tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
924         /* Get left neighbor block number. */
925         left_neighbor_blocknr =
926             B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
927         /* Look for the left neighbor in the cache. */
928         if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
929
930                 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
931                        "vs-8170: left neighbor (%b %z) is not in the tree",
932                        left, left);
933                 put_bh(left);
934                 return 1;
935         }
936
937         return 0;
938 }
939
940 #define LEFT_PARENTS  'l'
941 #define RIGHT_PARENTS 'r'
942
943 static void decrement_key(struct cpu_key *key)
944 {
945         // call item specific function for this key
946         item_ops[cpu_key_k_type(key)]->decrement_key(key);
947 }
948
949 /* Calculate far left/right parent of the left/right neighbor of the current node, that
950  * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
951  * Calculate left/right common parent of the current node and L[h]/R[h].
952  * Calculate left/right delimiting key position.
953  * Returns:     PATH_INCORRECT   - path in the tree is not correct;
954                 SCHEDULE_OCCURRED - schedule occurred while the function worked;
955  *              CARRY_ON         - schedule didn't occur while the function worked;
956  */
957 static int get_far_parent(struct tree_balance *tb,
958                           int h,
959                           struct buffer_head **pfather,
960                           struct buffer_head **pcom_father, char c_lr_par)
961 {
962         struct buffer_head *parent;
963         INITIALIZE_PATH(s_path_to_neighbor_father);
964         struct treepath *path = tb->tb_path;
965         struct cpu_key s_lr_father_key;
966         int counter,
967             position = INT_MAX,
968             first_last_position = 0,
969             path_offset = PATH_H_PATH_OFFSET(path, h);
970
971         /* Starting from F[h] go upwards in the tree, and look for the common
972            ancestor of F[h], and its neighbor l/r, that should be obtained. */
973
974         counter = path_offset;
975
976         RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
977                "PAP-8180: invalid path length");
978
979         for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
980                 /* Check whether parent of the current buffer in the path is really parent in the tree. */
981                 if (!B_IS_IN_TREE
982                     (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
983                         return REPEAT_SEARCH;
984                 /* Check whether position in the parent is correct. */
985                 if ((position =
986                      PATH_OFFSET_POSITION(path,
987                                           counter - 1)) >
988                     B_NR_ITEMS(parent))
989                         return REPEAT_SEARCH;
990                 /* Check whether parent at the path really points to the child. */
991                 if (B_N_CHILD_NUM(parent, position) !=
992                     PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
993                         return REPEAT_SEARCH;
994                 /* Return delimiting key if position in the parent is not equal to first/last one. */
995                 if (c_lr_par == RIGHT_PARENTS)
996                         first_last_position = B_NR_ITEMS(parent);
997                 if (position != first_last_position) {
998                         *pcom_father = parent;
999                         get_bh(*pcom_father);
1000                         /*(*pcom_father = parent)->b_count++; */
1001                         break;
1002                 }
1003         }
1004
1005         /* if we are in the root of the tree, then there is no common father */
1006         if (counter == FIRST_PATH_ELEMENT_OFFSET) {
1007                 /* Check whether first buffer in the path is the root of the tree. */
1008                 if (PATH_OFFSET_PBUFFER
1009                     (tb->tb_path,
1010                      FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1011                     SB_ROOT_BLOCK(tb->tb_sb)) {
1012                         *pfather = *pcom_father = NULL;
1013                         return CARRY_ON;
1014                 }
1015                 return REPEAT_SEARCH;
1016         }
1017
1018         RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
1019                "PAP-8185: (%b %z) level too small",
1020                *pcom_father, *pcom_father);
1021
1022         /* Check whether the common parent is locked. */
1023
1024         if (buffer_locked(*pcom_father)) {
1025                 __wait_on_buffer(*pcom_father);
1026                 if (FILESYSTEM_CHANGED_TB(tb)) {
1027                         brelse(*pcom_father);
1028                         return REPEAT_SEARCH;
1029                 }
1030         }
1031
1032         /* So, we got common parent of the current node and its left/right neighbor.
1033            Now we are geting the parent of the left/right neighbor. */
1034
1035         /* Form key to get parent of the left/right neighbor. */
1036         le_key2cpu_key(&s_lr_father_key,
1037                        B_N_PDELIM_KEY(*pcom_father,
1038                                       (c_lr_par ==
1039                                        LEFT_PARENTS) ? (tb->lkey[h - 1] =
1040                                                         position -
1041                                                         1) : (tb->rkey[h -
1042                                                                            1] =
1043                                                               position)));
1044
1045         if (c_lr_par == LEFT_PARENTS)
1046                 decrement_key(&s_lr_father_key);
1047
1048         if (search_by_key
1049             (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1050              h + 1) == IO_ERROR)
1051                 // path is released
1052                 return IO_ERROR;
1053
1054         if (FILESYSTEM_CHANGED_TB(tb)) {
1055                 pathrelse(&s_path_to_neighbor_father);
1056                 brelse(*pcom_father);
1057                 return REPEAT_SEARCH;
1058         }
1059
1060         *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1061
1062         RFALSE(B_LEVEL(*pfather) != h + 1,
1063                "PAP-8190: (%b %z) level too small", *pfather, *pfather);
1064         RFALSE(s_path_to_neighbor_father.path_length <
1065                FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1066
1067         s_path_to_neighbor_father.path_length--;
1068         pathrelse(&s_path_to_neighbor_father);
1069         return CARRY_ON;
1070 }
1071
1072 /* Get parents of neighbors of node in the path(S[path_offset]) and common parents of
1073  * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset],
1074  * FR[path_offset], CFL[path_offset], CFR[path_offset].
1075  * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset].
1076  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1077  *              CARRY_ON - schedule didn't occur while the function worked;
1078  */
1079 static int get_parents(struct tree_balance *tb, int h)
1080 {
1081         struct treepath *path = tb->tb_path;
1082         int position,
1083             ret,
1084             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1085         struct buffer_head *curf, *curcf;
1086
1087         /* Current node is the root of the tree or will be root of the tree */
1088         if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1089                 /* The root can not have parents.
1090                    Release nodes which previously were obtained as parents of the current node neighbors. */
1091                 brelse(tb->FL[h]);
1092                 brelse(tb->CFL[h]);
1093                 brelse(tb->FR[h]);
1094                 brelse(tb->CFR[h]);
1095                 tb->FL[h]  = NULL;
1096                 tb->CFL[h] = NULL;
1097                 tb->FR[h]  = NULL;
1098                 tb->CFR[h] = NULL;
1099                 return CARRY_ON;
1100         }
1101
1102         /* Get parent FL[path_offset] of L[path_offset]. */
1103         position = PATH_OFFSET_POSITION(path, path_offset - 1);
1104         if (position) {
1105                 /* Current node is not the first child of its parent. */
1106                 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1107                 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1108                 get_bh(curf);
1109                 get_bh(curf);
1110                 tb->lkey[h] = position - 1;
1111         } else {
1112                 /* Calculate current parent of L[path_offset], which is the left neighbor of the current node.
1113                    Calculate current common parent of L[path_offset] and the current node. Note that
1114                    CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset].
1115                    Calculate lkey[path_offset]. */
1116                 if ((ret = get_far_parent(tb, h + 1, &curf,
1117                                                   &curcf,
1118                                                   LEFT_PARENTS)) != CARRY_ON)
1119                         return ret;
1120         }
1121
1122         brelse(tb->FL[h]);
1123         tb->FL[h] = curf;       /* New initialization of FL[h]. */
1124         brelse(tb->CFL[h]);
1125         tb->CFL[h] = curcf;     /* New initialization of CFL[h]. */
1126
1127         RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1128                (curcf && !B_IS_IN_TREE(curcf)),
1129                "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
1130
1131 /* Get parent FR[h] of R[h]. */
1132
1133 /* Current node is the last child of F[h]. FR[h] != F[h]. */
1134         if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
1135 /* Calculate current parent of R[h], which is the right neighbor of F[h].
1136    Calculate current common parent of R[h] and current node. Note that CFR[h]
1137    not equal FR[path_offset] and CFR[h] not equal F[h]. */
1138                 if ((ret =
1139                      get_far_parent(tb, h + 1, &curf, &curcf,
1140                                     RIGHT_PARENTS)) != CARRY_ON)
1141                         return ret;
1142         } else {
1143 /* Current node is not the last child of its parent F[h]. */
1144                 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1145                 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1146                 get_bh(curf);
1147                 get_bh(curf);
1148                 tb->rkey[h] = position;
1149         }
1150
1151         brelse(tb->FR[h]);
1152         /* New initialization of FR[path_offset]. */
1153         tb->FR[h] = curf;
1154
1155         brelse(tb->CFR[h]);
1156         /* New initialization of CFR[path_offset]. */
1157         tb->CFR[h] = curcf;
1158
1159         RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1160                (curcf && !B_IS_IN_TREE(curcf)),
1161                "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
1162
1163         return CARRY_ON;
1164 }
1165
1166 /* it is possible to remove node as result of shiftings to
1167    neighbors even when we insert or paste item. */
1168 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1169                                       struct tree_balance *tb, int h)
1170 {
1171         struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1172         int levbytes = tb->insert_size[h];
1173         struct item_head *ih;
1174         struct reiserfs_key *r_key = NULL;
1175
1176         ih = B_N_PITEM_HEAD(Sh, 0);
1177         if (tb->CFR[h])
1178                 r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
1179
1180         if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1181             /* shifting may merge items which might save space */
1182             -
1183             ((!h
1184               && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
1185             -
1186             ((!h && r_key
1187               && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1188             + ((h) ? KEY_SIZE : 0)) {
1189                 /* node can not be removed */
1190                 if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1191                         if (!h)
1192                                 tb->s0num =
1193                                     B_NR_ITEMS(Sh) +
1194                                     ((mode == M_INSERT) ? 1 : 0);
1195                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1196                         return NO_BALANCING_NEEDED;
1197                 }
1198         }
1199         PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1200         return !NO_BALANCING_NEEDED;
1201 }
1202
1203 /* Check whether current node S[h] is balanced when increasing its size by
1204  * Inserting or Pasting.
1205  * Calculate parameters for balancing for current level h.
1206  * Parameters:
1207  *      tb      tree_balance structure;
1208  *      h       current level of the node;
1209  *      inum    item number in S[h];
1210  *      mode    i - insert, p - paste;
1211  * Returns:     1 - schedule occurred;
1212  *              0 - balancing for higher levels needed;
1213  *             -1 - no balancing for higher levels needed;
1214  *             -2 - no disk space.
1215  */
1216 /* ip means Inserting or Pasting */
1217 static int ip_check_balance(struct tree_balance *tb, int h)
1218 {
1219         struct virtual_node *vn = tb->tb_vn;
1220         int levbytes,           /* Number of bytes that must be inserted into (value
1221                                    is negative if bytes are deleted) buffer which
1222                                    contains node being balanced.  The mnemonic is
1223                                    that the attempted change in node space used level
1224                                    is levbytes bytes. */
1225          ret;
1226
1227         int lfree, sfree, rfree /* free space in L, S and R */ ;
1228
1229         /* nver is short for number of vertixes, and lnver is the number if
1230            we shift to the left, rnver is the number if we shift to the
1231            right, and lrnver is the number if we shift in both directions.
1232            The goal is to minimize first the number of vertixes, and second,
1233            the number of vertixes whose contents are changed by shifting,
1234            and third the number of uncached vertixes whose contents are
1235            changed by shifting and must be read from disk.  */
1236         int nver, lnver, rnver, lrnver;
1237
1238         /* used at leaf level only, S0 = S[0] is the node being balanced,
1239            sInum [ I = 0,1,2 ] is the number of items that will
1240            remain in node SI after balancing.  S1 and S2 are new
1241            nodes that might be created. */
1242
1243         /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
1244            where 4th parameter is s1bytes and 5th - s2bytes
1245          */
1246         short snum012[40] = { 0, };     /* s0num, s1num, s2num for 8 cases
1247                                            0,1 - do not shift and do not shift but bottle
1248                                            2 - shift only whole item to left
1249                                            3 - shift to left and bottle as much as possible
1250                                            4,5 - shift to right (whole items and as much as possible
1251                                            6,7 - shift to both directions (whole items and as much as possible)
1252                                          */
1253
1254         /* Sh is the node whose balance is currently being checked */
1255         struct buffer_head *Sh;
1256
1257         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1258         levbytes = tb->insert_size[h];
1259
1260         /* Calculate balance parameters for creating new root. */
1261         if (!Sh) {
1262                 if (!h)
1263                         reiserfs_panic(tb->tb_sb, "vs-8210",
1264                                        "S[0] can not be 0");
1265                 switch (ret = get_empty_nodes(tb, h)) {
1266                 case CARRY_ON:
1267                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1268                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1269
1270                 case NO_DISK_SPACE:
1271                 case REPEAT_SEARCH:
1272                         return ret;
1273                 default:
1274                         reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
1275                                        "return value of get_empty_nodes");
1276                 }
1277         }
1278
1279         if ((ret = get_parents(tb, h)) != CARRY_ON)     /* get parents of S[h] neighbors. */
1280                 return ret;
1281
1282         sfree = B_FREE_SPACE(Sh);
1283
1284         /* get free space of neighbors */
1285         rfree = get_rfree(tb, h);
1286         lfree = get_lfree(tb, h);
1287
1288         if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1289             NO_BALANCING_NEEDED)
1290                 /* and new item fits into node S[h] without any shifting */
1291                 return NO_BALANCING_NEEDED;
1292
1293         create_virtual_node(tb, h);
1294
1295         /*
1296            determine maximal number of items we can shift to the left neighbor (in tb structure)
1297            and the maximal number of bytes that can flow to the left neighbor
1298            from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1299          */
1300         check_left(tb, h, lfree);
1301
1302         /*
1303            determine maximal number of items we can shift to the right neighbor (in tb structure)
1304            and the maximal number of bytes that can flow to the right neighbor
1305            from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1306          */
1307         check_right(tb, h, rfree);
1308
1309         /* all contents of internal node S[h] can be moved into its
1310            neighbors, S[h] will be removed after balancing */
1311         if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1312                 int to_r;
1313
1314                 /* Since we are working on internal nodes, and our internal
1315                    nodes have fixed size entries, then we can balance by the
1316                    number of items rather than the space they consume.  In this
1317                    routine we set the left node equal to the right node,
1318                    allowing a difference of less than or equal to 1 child
1319                    pointer. */
1320                 to_r =
1321                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1322                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1323                                                 tb->rnum[h]);
1324                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1325                                -1, -1);
1326                 return CARRY_ON;
1327         }
1328
1329         /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1330         RFALSE(h &&
1331                (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1332                 tb->rnum[h] >= vn->vn_nr_item + 1),
1333                "vs-8220: tree is not balanced on internal level");
1334         RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1335                       (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1336                "vs-8225: tree is not balanced on leaf level");
1337
1338         /* all contents of S[0] can be moved into its neighbors
1339            S[0] will be removed after balancing. */
1340         if (!h && is_leaf_removable(tb))
1341                 return CARRY_ON;
1342
1343         /* why do we perform this check here rather than earlier??
1344            Answer: we can win 1 node in some cases above. Moreover we
1345            checked it above, when we checked, that S[0] is not removable
1346            in principle */
1347         if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1348                 if (!h)
1349                         tb->s0num = vn->vn_nr_item;
1350                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1351                 return NO_BALANCING_NEEDED;
1352         }
1353
1354         {
1355                 int lpar, rpar, nset, lset, rset, lrset;
1356                 /*
1357                  * regular overflowing of the node
1358                  */
1359
1360                 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1361                    lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1362                    nset, lset, rset, lrset - shows, whether flowing items give better packing
1363                  */
1364 #define FLOW 1
1365 #define NO_FLOW 0               /* do not any splitting */
1366
1367                 /* we choose one the following */
1368 #define NOTHING_SHIFT_NO_FLOW   0
1369 #define NOTHING_SHIFT_FLOW      5
1370 #define LEFT_SHIFT_NO_FLOW      10
1371 #define LEFT_SHIFT_FLOW         15
1372 #define RIGHT_SHIFT_NO_FLOW     20
1373 #define RIGHT_SHIFT_FLOW        25
1374 #define LR_SHIFT_NO_FLOW        30
1375 #define LR_SHIFT_FLOW           35
1376
1377                 lpar = tb->lnum[h];
1378                 rpar = tb->rnum[h];
1379
1380                 /* calculate number of blocks S[h] must be split into when
1381                    nothing is shifted to the neighbors,
1382                    as well as number of items in each part of the split node (s012 numbers),
1383                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1384                 nset = NOTHING_SHIFT_NO_FLOW;
1385                 nver = get_num_ver(vn->vn_mode, tb, h,
1386                                    0, -1, h ? vn->vn_nr_item : 0, -1,
1387                                    snum012, NO_FLOW);
1388
1389                 if (!h) {
1390                         int nver1;
1391
1392                         /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1393                         nver1 = get_num_ver(vn->vn_mode, tb, h,
1394                                             0, -1, 0, -1,
1395                                             snum012 + NOTHING_SHIFT_FLOW, FLOW);
1396                         if (nver > nver1)
1397                                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1398                 }
1399
1400                 /* calculate number of blocks S[h] must be split into when
1401                    l_shift_num first items and l_shift_bytes of the right most
1402                    liquid item to be shifted are shifted to the left neighbor,
1403                    as well as number of items in each part of the splitted node (s012 numbers),
1404                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1405                  */
1406                 lset = LEFT_SHIFT_NO_FLOW;
1407                 lnver = get_num_ver(vn->vn_mode, tb, h,
1408                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1409                                     -1, h ? vn->vn_nr_item : 0, -1,
1410                                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1411                 if (!h) {
1412                         int lnver1;
1413
1414                         lnver1 = get_num_ver(vn->vn_mode, tb, h,
1415                                              lpar -
1416                                              ((tb->lbytes != -1) ? 1 : 0),
1417                                              tb->lbytes, 0, -1,
1418                                              snum012 + LEFT_SHIFT_FLOW, FLOW);
1419                         if (lnver > lnver1)
1420                                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1421                 }
1422
1423                 /* calculate number of blocks S[h] must be split into when
1424                    r_shift_num first items and r_shift_bytes of the left most
1425                    liquid item to be shifted are shifted to the right neighbor,
1426                    as well as number of items in each part of the splitted node (s012 numbers),
1427                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1428                  */
1429                 rset = RIGHT_SHIFT_NO_FLOW;
1430                 rnver = get_num_ver(vn->vn_mode, tb, h,
1431                                     0, -1,
1432                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1433                                                                    ((tb->
1434                                                                      rbytes !=
1435                                                                      -1) ? 1 :
1436                                                                     0)), -1,
1437                                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1438                 if (!h) {
1439                         int rnver1;
1440
1441                         rnver1 = get_num_ver(vn->vn_mode, tb, h,
1442                                              0, -1,
1443                                              (rpar -
1444                                               ((tb->rbytes != -1) ? 1 : 0)),
1445                                              tb->rbytes,
1446                                              snum012 + RIGHT_SHIFT_FLOW, FLOW);
1447
1448                         if (rnver > rnver1)
1449                                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1450                 }
1451
1452                 /* calculate number of blocks S[h] must be split into when
1453                    items are shifted in both directions,
1454                    as well as number of items in each part of the splitted node (s012 numbers),
1455                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1456                  */
1457                 lrset = LR_SHIFT_NO_FLOW;
1458                 lrnver = get_num_ver(vn->vn_mode, tb, h,
1459                                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1460                                      -1,
1461                                      h ? (vn->vn_nr_item - rpar) : (rpar -
1462                                                                     ((tb->
1463                                                                       rbytes !=
1464                                                                       -1) ? 1 :
1465                                                                      0)), -1,
1466                                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1467                 if (!h) {
1468                         int lrnver1;
1469
1470                         lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1471                                               lpar -
1472                                               ((tb->lbytes != -1) ? 1 : 0),
1473                                               tb->lbytes,
1474                                               (rpar -
1475                                                ((tb->rbytes != -1) ? 1 : 0)),
1476                                               tb->rbytes,
1477                                               snum012 + LR_SHIFT_FLOW, FLOW);
1478                         if (lrnver > lrnver1)
1479                                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1480                 }
1481
1482                 /* Our general shifting strategy is:
1483                    1) to minimized number of new nodes;
1484                    2) to minimized number of neighbors involved in shifting;
1485                    3) to minimized number of disk reads; */
1486
1487                 /* we can win TWO or ONE nodes by shifting in both directions */
1488                 if (lrnver < lnver && lrnver < rnver) {
1489                         RFALSE(h &&
1490                                (tb->lnum[h] != 1 ||
1491                                 tb->rnum[h] != 1 ||
1492                                 lrnver != 1 || rnver != 2 || lnver != 2
1493                                 || h != 1), "vs-8230: bad h");
1494                         if (lrset == LR_SHIFT_FLOW)
1495                                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1496                                                lrnver, snum012 + lrset,
1497                                                tb->lbytes, tb->rbytes);
1498                         else
1499                                 set_parameters(tb, h,
1500                                                tb->lnum[h] -
1501                                                ((tb->lbytes == -1) ? 0 : 1),
1502                                                tb->rnum[h] -
1503                                                ((tb->rbytes == -1) ? 0 : 1),
1504                                                lrnver, snum012 + lrset, -1, -1);
1505
1506                         return CARRY_ON;
1507                 }
1508
1509                 /* if shifting doesn't lead to better packing then don't shift */
1510                 if (nver == lrnver) {
1511                         set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1512                                        -1);
1513                         return CARRY_ON;
1514                 }
1515
1516                 /* now we know that for better packing shifting in only one
1517                    direction either to the left or to the right is required */
1518
1519                 /*  if shifting to the left is better than shifting to the right */
1520                 if (lnver < rnver) {
1521                         SET_PAR_SHIFT_LEFT;
1522                         return CARRY_ON;
1523                 }
1524
1525                 /* if shifting to the right is better than shifting to the left */
1526                 if (lnver > rnver) {
1527                         SET_PAR_SHIFT_RIGHT;
1528                         return CARRY_ON;
1529                 }
1530
1531                 /* now shifting in either direction gives the same number
1532                    of nodes and we can make use of the cached neighbors */
1533                 if (is_left_neighbor_in_cache(tb, h)) {
1534                         SET_PAR_SHIFT_LEFT;
1535                         return CARRY_ON;
1536                 }
1537
1538                 /* shift to the right independently on whether the right neighbor in cache or not */
1539                 SET_PAR_SHIFT_RIGHT;
1540                 return CARRY_ON;
1541         }
1542 }
1543
1544 /* Check whether current node S[h] is balanced when Decreasing its size by
1545  * Deleting or Cutting for INTERNAL node of S+tree.
1546  * Calculate parameters for balancing for current level h.
1547  * Parameters:
1548  *      tb      tree_balance structure;
1549  *      h       current level of the node;
1550  *      inum    item number in S[h];
1551  *      mode    i - insert, p - paste;
1552  * Returns:     1 - schedule occurred;
1553  *              0 - balancing for higher levels needed;
1554  *             -1 - no balancing for higher levels needed;
1555  *             -2 - no disk space.
1556  *
1557  * Note: Items of internal nodes have fixed size, so the balance condition for
1558  * the internal part of S+tree is as for the B-trees.
1559  */
1560 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1561 {
1562         struct virtual_node *vn = tb->tb_vn;
1563
1564         /* Sh is the node whose balance is currently being checked,
1565            and Fh is its father.  */
1566         struct buffer_head *Sh, *Fh;
1567         int maxsize, ret;
1568         int lfree, rfree /* free space in L and R */ ;
1569
1570         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1571         Fh = PATH_H_PPARENT(tb->tb_path, h);
1572
1573         maxsize = MAX_CHILD_SIZE(Sh);
1574
1575 /*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1576 /*   new_nr_item = number of items node would have if operation is */
1577 /*      performed without balancing (new_nr_item); */
1578         create_virtual_node(tb, h);
1579
1580         if (!Fh) {              /* S[h] is the root. */
1581                 if (vn->vn_nr_item > 0) {
1582                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1583                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1584                 }
1585                 /* new_nr_item == 0.
1586                  * Current root will be deleted resulting in
1587                  * decrementing the tree height. */
1588                 set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1589                 return CARRY_ON;
1590         }
1591
1592         if ((ret = get_parents(tb, h)) != CARRY_ON)
1593                 return ret;
1594
1595         /* get free space of neighbors */
1596         rfree = get_rfree(tb, h);
1597         lfree = get_lfree(tb, h);
1598
1599         /* determine maximal number of items we can fit into neighbors */
1600         check_left(tb, h, lfree);
1601         check_right(tb, h, rfree);
1602
1603         if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
1604                                                  * In this case we balance only if it leads to better packing. */
1605                 if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
1606                                                          * which is impossible with greater values of new_nr_item. */
1607                         if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1608                                 /* All contents of S[h] can be moved to L[h]. */
1609                                 int n;
1610                                 int order_L;
1611
1612                                 order_L =
1613                                     ((n =
1614                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1615                                                           h)) ==
1616                                      0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1617                                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1618                                     (DC_SIZE + KEY_SIZE);
1619                                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1620                                                -1);
1621                                 return CARRY_ON;
1622                         }
1623
1624                         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1625                                 /* All contents of S[h] can be moved to R[h]. */
1626                                 int n;
1627                                 int order_R;
1628
1629                                 order_R =
1630                                     ((n =
1631                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1632                                                           h)) ==
1633                                      B_NR_ITEMS(Fh)) ? 0 : n + 1;
1634                                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1635                                     (DC_SIZE + KEY_SIZE);
1636                                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1637                                                -1);
1638                                 return CARRY_ON;
1639                         }
1640                 }
1641
1642                 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1643                         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1644                         int to_r;
1645
1646                         to_r =
1647                             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1648                              tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1649                             (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1650                         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1651                                        0, NULL, -1, -1);
1652                         return CARRY_ON;
1653                 }
1654
1655                 /* Balancing does not lead to better packing. */
1656                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1657                 return NO_BALANCING_NEEDED;
1658         }
1659
1660         /* Current node contain insufficient number of items. Balancing is required. */
1661         /* Check whether we can merge S[h] with left neighbor. */
1662         if (tb->lnum[h] >= vn->vn_nr_item + 1)
1663                 if (is_left_neighbor_in_cache(tb, h)
1664                     || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1665                         int n;
1666                         int order_L;
1667
1668                         order_L =
1669                             ((n =
1670                               PATH_H_B_ITEM_ORDER(tb->tb_path,
1671                                                   h)) ==
1672                              0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1673                         n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1674                                                                       KEY_SIZE);
1675                         set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1676                         return CARRY_ON;
1677                 }
1678
1679         /* Check whether we can merge S[h] with right neighbor. */
1680         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1681                 int n;
1682                 int order_R;
1683
1684                 order_R =
1685                     ((n =
1686                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1687                                           h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1688                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1689                                                               KEY_SIZE);
1690                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1691                 return CARRY_ON;
1692         }
1693
1694         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1695         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1696                 int to_r;
1697
1698                 to_r =
1699                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1700                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1701                                                 tb->rnum[h]);
1702                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1703                                -1, -1);
1704                 return CARRY_ON;
1705         }
1706
1707         /* For internal nodes try to borrow item from a neighbor */
1708         RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1709
1710         /* Borrow one or two items from caching neighbor */
1711         if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1712                 int from_l;
1713
1714                 from_l =
1715                     (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1716                      1) / 2 - (vn->vn_nr_item + 1);
1717                 set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1718                 return CARRY_ON;
1719         }
1720
1721         set_parameters(tb, h, 0,
1722                        -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1723                           1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1724         return CARRY_ON;
1725 }
1726
1727 /* Check whether current node S[h] is balanced when Decreasing its size by
1728  * Deleting or Truncating for LEAF node of S+tree.
1729  * Calculate parameters for balancing for current level h.
1730  * Parameters:
1731  *      tb      tree_balance structure;
1732  *      h       current level of the node;
1733  *      inum    item number in S[h];
1734  *      mode    i - insert, p - paste;
1735  * Returns:     1 - schedule occurred;
1736  *              0 - balancing for higher levels needed;
1737  *             -1 - no balancing for higher levels needed;
1738  *             -2 - no disk space.
1739  */
1740 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1741 {
1742         struct virtual_node *vn = tb->tb_vn;
1743
1744         /* Number of bytes that must be deleted from
1745            (value is negative if bytes are deleted) buffer which
1746            contains node being balanced.  The mnemonic is that the
1747            attempted change in node space used level is levbytes bytes. */
1748         int levbytes;
1749         /* the maximal item size */
1750         int maxsize, ret;
1751         /* S0 is the node whose balance is currently being checked,
1752            and F0 is its father.  */
1753         struct buffer_head *S0, *F0;
1754         int lfree, rfree /* free space in L and R */ ;
1755
1756         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1757         F0 = PATH_H_PPARENT(tb->tb_path, 0);
1758
1759         levbytes = tb->insert_size[h];
1760
1761         maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1762
1763         if (!F0) {              /* S[0] is the root now. */
1764
1765                 RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1766                        "vs-8240: attempt to create empty buffer tree");
1767
1768                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1769                 return NO_BALANCING_NEEDED;
1770         }
1771
1772         if ((ret = get_parents(tb, h)) != CARRY_ON)
1773                 return ret;
1774
1775         /* get free space of neighbors */
1776         rfree = get_rfree(tb, h);
1777         lfree = get_lfree(tb, h);
1778
1779         create_virtual_node(tb, h);
1780
1781         /* if 3 leaves can be merge to one, set parameters and return */
1782         if (are_leaves_removable(tb, lfree, rfree))
1783                 return CARRY_ON;
1784
1785         /* determine maximal number of items we can shift to the left/right  neighbor
1786            and the maximal number of bytes that can flow to the left/right neighbor
1787            from the left/right most liquid item that cannot be shifted from S[0] entirely
1788          */
1789         check_left(tb, h, lfree);
1790         check_right(tb, h, rfree);
1791
1792         /* check whether we can merge S with left neighbor. */
1793         if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1794                 if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1795                     !tb->FR[h]) {
1796
1797                         RFALSE(!tb->FL[h],
1798                                "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1799
1800                         /* set parameter to merge S[0] with its left neighbor */
1801                         set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1802                         return CARRY_ON;
1803                 }
1804
1805         /* check whether we can merge S[0] with right neighbor. */
1806         if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1807                 set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
1808                 return CARRY_ON;
1809         }
1810
1811         /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1812         if (is_leaf_removable(tb))
1813                 return CARRY_ON;
1814
1815         /* Balancing is not required. */
1816         tb->s0num = vn->vn_nr_item;
1817         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1818         return NO_BALANCING_NEEDED;
1819 }
1820
1821 /* Check whether current node S[h] is balanced when Decreasing its size by
1822  * Deleting or Cutting.
1823  * Calculate parameters for balancing for current level h.
1824  * Parameters:
1825  *      tb      tree_balance structure;
1826  *      h       current level of the node;
1827  *      inum    item number in S[h];
1828  *      mode    d - delete, c - cut.
1829  * Returns:     1 - schedule occurred;
1830  *              0 - balancing for higher levels needed;
1831  *             -1 - no balancing for higher levels needed;
1832  *             -2 - no disk space.
1833  */
1834 static int dc_check_balance(struct tree_balance *tb, int h)
1835 {
1836         RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
1837                "vs-8250: S is not initialized");
1838
1839         if (h)
1840                 return dc_check_balance_internal(tb, h);
1841         else
1842                 return dc_check_balance_leaf(tb, h);
1843 }
1844
1845 /* Check whether current node S[h] is balanced.
1846  * Calculate parameters for balancing for current level h.
1847  * Parameters:
1848  *
1849  *      tb      tree_balance structure:
1850  *
1851  *              tb is a large structure that must be read about in the header file
1852  *              at the same time as this procedure if the reader is to successfully
1853  *              understand this procedure
1854  *
1855  *      h       current level of the node;
1856  *      inum    item number in S[h];
1857  *      mode    i - insert, p - paste, d - delete, c - cut.
1858  * Returns:     1 - schedule occurred;
1859  *              0 - balancing for higher levels needed;
1860  *             -1 - no balancing for higher levels needed;
1861  *             -2 - no disk space.
1862  */
1863 static int check_balance(int mode,
1864                          struct tree_balance *tb,
1865                          int h,
1866                          int inum,
1867                          int pos_in_item,
1868                          struct item_head *ins_ih, const void *data)
1869 {
1870         struct virtual_node *vn;
1871
1872         vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1873         vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1874         vn->vn_mode = mode;
1875         vn->vn_affected_item_num = inum;
1876         vn->vn_pos_in_item = pos_in_item;
1877         vn->vn_ins_ih = ins_ih;
1878         vn->vn_data = data;
1879
1880         RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
1881                "vs-8255: ins_ih can not be 0 in insert mode");
1882
1883         if (tb->insert_size[h] > 0)
1884                 /* Calculate balance parameters when size of node is increasing. */
1885                 return ip_check_balance(tb, h);
1886
1887         /* Calculate balance parameters when  size of node is decreasing. */
1888         return dc_check_balance(tb, h);
1889 }
1890
1891 /* Check whether parent at the path is the really parent of the current node.*/
1892 static int get_direct_parent(struct tree_balance *tb, int h)
1893 {
1894         struct buffer_head *bh;
1895         struct treepath *path = tb->tb_path;
1896         int position,
1897             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1898
1899         /* We are in the root or in the new root. */
1900         if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1901
1902                 RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1903                        "PAP-8260: invalid offset in the path");
1904
1905                 if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
1906                     b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
1907                         /* Root is not changed. */
1908                         PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
1909                         PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
1910                         return CARRY_ON;
1911                 }
1912                 return REPEAT_SEARCH;   /* Root is changed and we must recalculate the path. */
1913         }
1914
1915         if (!B_IS_IN_TREE
1916             (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
1917                 return REPEAT_SEARCH;   /* Parent in the path is not in the tree. */
1918
1919         if ((position =
1920              PATH_OFFSET_POSITION(path,
1921                                   path_offset - 1)) > B_NR_ITEMS(bh))
1922                 return REPEAT_SEARCH;
1923
1924         if (B_N_CHILD_NUM(bh, position) !=
1925             PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
1926                 /* Parent in the path is not parent of the current node in the tree. */
1927                 return REPEAT_SEARCH;
1928
1929         if (buffer_locked(bh)) {
1930                 __wait_on_buffer(bh);
1931                 if (FILESYSTEM_CHANGED_TB(tb))
1932                         return REPEAT_SEARCH;
1933         }
1934
1935         return CARRY_ON;        /* Parent in the path is unlocked and really parent of the current node.  */
1936 }
1937
1938 /* Using lnum[h] and rnum[h] we should determine what neighbors
1939  * of S[h] we
1940  * need in order to balance S[h], and get them if necessary.
1941  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1942  *              CARRY_ON - schedule didn't occur while the function worked;
1943  */
1944 static int get_neighbors(struct tree_balance *tb, int h)
1945 {
1946         int child_position,
1947             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
1948         unsigned long son_number;
1949         struct super_block *sb = tb->tb_sb;
1950         struct buffer_head *bh;
1951
1952         PROC_INFO_INC(sb, get_neighbors[h]);
1953
1954         if (tb->lnum[h]) {
1955                 /* We need left neighbor to balance S[h]. */
1956                 PROC_INFO_INC(sb, need_l_neighbor[h]);
1957                 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
1958
1959                 RFALSE(bh == tb->FL[h] &&
1960                        !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
1961                        "PAP-8270: invalid position in the parent");
1962
1963                 child_position =
1964                     (bh ==
1965                      tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1966                                                                        FL[h]);
1967                 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1968                 bh = sb_bread(sb, son_number);
1969                 if (!bh)
1970                         return IO_ERROR;
1971                 if (FILESYSTEM_CHANGED_TB(tb)) {
1972                         brelse(bh);
1973                         PROC_INFO_INC(sb, get_neighbors_restart[h]);
1974                         return REPEAT_SEARCH;
1975                 }
1976
1977                 RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
1978                        child_position > B_NR_ITEMS(tb->FL[h]) ||
1979                        B_N_CHILD_NUM(tb->FL[h], child_position) !=
1980                        bh->b_blocknr, "PAP-8275: invalid parent");
1981                 RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
1982                 RFALSE(!h &&
1983                        B_FREE_SPACE(bh) !=
1984                        MAX_CHILD_SIZE(bh) -
1985                        dc_size(B_N_CHILD(tb->FL[0], child_position)),
1986                        "PAP-8290: invalid child size of left neighbor");
1987
1988                 brelse(tb->L[h]);
1989                 tb->L[h] = bh;
1990         }
1991
1992         /* We need right neighbor to balance S[path_offset]. */
1993         if (tb->rnum[h]) {      /* We need right neighbor to balance S[path_offset]. */
1994                 PROC_INFO_INC(sb, need_r_neighbor[h]);
1995                 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
1996
1997                 RFALSE(bh == tb->FR[h] &&
1998                        PATH_OFFSET_POSITION(tb->tb_path,
1999                                             path_offset) >=
2000                        B_NR_ITEMS(bh),
2001                        "PAP-8295: invalid position in the parent");
2002
2003                 child_position =
2004                     (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2005                 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2006                 bh = sb_bread(sb, son_number);
2007                 if (!bh)
2008                         return IO_ERROR;
2009                 if (FILESYSTEM_CHANGED_TB(tb)) {
2010                         brelse(bh);
2011                         PROC_INFO_INC(sb, get_neighbors_restart[h]);
2012                         return REPEAT_SEARCH;
2013                 }
2014                 brelse(tb->R[h]);
2015                 tb->R[h] = bh;
2016
2017                 RFALSE(!h
2018                        && B_FREE_SPACE(bh) !=
2019                        MAX_CHILD_SIZE(bh) -
2020                        dc_size(B_N_CHILD(tb->FR[0], child_position)),
2021                        "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2022                        B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
2023                        dc_size(B_N_CHILD(tb->FR[0], child_position)));
2024
2025         }
2026         return CARRY_ON;
2027 }
2028
2029 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2030 {
2031         int max_num_of_items;
2032         int max_num_of_entries;
2033         unsigned long blocksize = sb->s_blocksize;
2034
2035 #define MIN_NAME_LEN 1
2036
2037         max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2038         max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2039             (DEH_SIZE + MIN_NAME_LEN);
2040
2041         return sizeof(struct virtual_node) +
2042             max(max_num_of_items * sizeof(struct virtual_item),
2043                 sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2044                 (max_num_of_entries - 1) * sizeof(__u16));
2045 }
2046
2047 /* maybe we should fail balancing we are going to perform when kmalloc
2048    fails several times. But now it will loop until kmalloc gets
2049    required memory */
2050 static int get_mem_for_virtual_node(struct tree_balance *tb)
2051 {
2052         int check_fs = 0;
2053         int size;
2054         char *buf;
2055
2056         size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2057
2058         if (size > tb->vn_buf_size) {
2059                 /* we have to allocate more memory for virtual node */
2060                 if (tb->vn_buf) {
2061                         /* free memory allocated before */
2062                         kfree(tb->vn_buf);
2063                         /* this is not needed if kfree is atomic */
2064                         check_fs = 1;
2065                 }
2066
2067                 /* virtual node requires now more memory */
2068                 tb->vn_buf_size = size;
2069
2070                 /* get memory for virtual item */
2071                 buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2072                 if (!buf) {
2073                         /* getting memory with GFP_KERNEL priority may involve
2074                            balancing now (due to indirect_to_direct conversion on
2075                            dcache shrinking). So, release path and collected
2076                            resources here */
2077                         free_buffers_in_tb(tb);
2078                         buf = kmalloc(size, GFP_NOFS);
2079                         if (!buf) {
2080                                 tb->vn_buf_size = 0;
2081                         }
2082                         tb->vn_buf = buf;
2083                         schedule();
2084                         return REPEAT_SEARCH;
2085                 }
2086
2087                 tb->vn_buf = buf;
2088         }
2089
2090         if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2091                 return REPEAT_SEARCH;
2092
2093         return CARRY_ON;
2094 }
2095
2096 #ifdef CONFIG_REISERFS_CHECK
2097 static void tb_buffer_sanity_check(struct super_block *sb,
2098                                    struct buffer_head *bh,
2099                                    const char *descr, int level)
2100 {
2101         if (bh) {
2102                 if (atomic_read(&(bh->b_count)) <= 0)
2103
2104                         reiserfs_panic(sb, "jmacd-1", "negative or zero "
2105                                        "reference counter for buffer %s[%d] "
2106                                        "(%b)", descr, level, bh);
2107
2108                 if (!buffer_uptodate(bh))
2109                         reiserfs_panic(sb, "jmacd-2", "buffer is not up "
2110                                        "to date %s[%d] (%b)",
2111                                        descr, level, bh);
2112
2113                 if (!B_IS_IN_TREE(bh))
2114                         reiserfs_panic(sb, "jmacd-3", "buffer is not "
2115                                        "in tree %s[%d] (%b)",
2116                                        descr, level, bh);
2117
2118                 if (bh->b_bdev != sb->s_bdev)
2119                         reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
2120                                        "device %s[%d] (%b)",
2121                                        descr, level, bh);
2122
2123                 if (bh->b_size != sb->s_blocksize)
2124                         reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
2125                                        "blocksize %s[%d] (%b)",
2126                                        descr, level, bh);
2127
2128                 if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
2129                         reiserfs_panic(sb, "jmacd-6", "buffer block "
2130                                        "number too high %s[%d] (%b)",
2131                                        descr, level, bh);
2132         }
2133 }
2134 #else
2135 static void tb_buffer_sanity_check(struct super_block *sb,
2136                                    struct buffer_head *bh,
2137                                    const char *descr, int level)
2138 {;
2139 }
2140 #endif
2141
2142 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2143 {
2144         return reiserfs_prepare_for_journal(s, bh, 0);
2145 }
2146
2147 static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2148 {
2149         struct buffer_head *locked;
2150 #ifdef CONFIG_REISERFS_CHECK
2151         int repeat_counter = 0;
2152 #endif
2153         int i;
2154
2155         do {
2156
2157                 locked = NULL;
2158
2159                 for (i = tb->tb_path->path_length;
2160                      !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2161                         if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
2162                                 /* if I understand correctly, we can only be sure the last buffer
2163                                  ** in the path is in the tree --clm
2164                                  */
2165 #ifdef CONFIG_REISERFS_CHECK
2166                                 if (PATH_PLAST_BUFFER(tb->tb_path) ==
2167                                     PATH_OFFSET_PBUFFER(tb->tb_path, i))
2168                                         tb_buffer_sanity_check(tb->tb_sb,
2169                                                                PATH_OFFSET_PBUFFER
2170                                                                (tb->tb_path,
2171                                                                 i), "S",
2172                                                                tb->tb_path->
2173                                                                path_length - i);
2174 #endif
2175                                 if (!clear_all_dirty_bits(tb->tb_sb,
2176                                                           PATH_OFFSET_PBUFFER
2177                                                           (tb->tb_path,
2178                                                            i))) {
2179                                         locked =
2180                                             PATH_OFFSET_PBUFFER(tb->tb_path,
2181                                                                 i);
2182                                 }
2183                         }
2184                 }
2185
2186                 for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
2187                      i++) {
2188
2189                         if (tb->lnum[i]) {
2190
2191                                 if (tb->L[i]) {
2192                                         tb_buffer_sanity_check(tb->tb_sb,
2193                                                                tb->L[i],
2194                                                                "L", i);
2195                                         if (!clear_all_dirty_bits
2196                                             (tb->tb_sb, tb->L[i]))
2197                                                 locked = tb->L[i];
2198                                 }
2199
2200                                 if (!locked && tb->FL[i]) {
2201                                         tb_buffer_sanity_check(tb->tb_sb,
2202                                                                tb->FL[i],
2203                                                                "FL", i);
2204                                         if (!clear_all_dirty_bits
2205                                             (tb->tb_sb, tb->FL[i]))
2206                                                 locked = tb->FL[i];
2207                                 }
2208
2209                                 if (!locked && tb->CFL[i]) {
2210                                         tb_buffer_sanity_check(tb->tb_sb,
2211                                                                tb->CFL[i],
2212                                                                "CFL", i);
2213                                         if (!clear_all_dirty_bits
2214                                             (tb->tb_sb, tb->CFL[i]))
2215                                                 locked = tb->CFL[i];
2216                                 }
2217
2218                         }
2219
2220                         if (!locked && (tb->rnum[i])) {
2221
2222                                 if (tb->R[i]) {
2223                                         tb_buffer_sanity_check(tb->tb_sb,
2224                                                                tb->R[i],
2225                                                                "R", i);
2226                                         if (!clear_all_dirty_bits
2227                                             (tb->tb_sb, tb->R[i]))
2228                                                 locked = tb->R[i];
2229                                 }
2230
2231                                 if (!locked && tb->FR[i]) {
2232                                         tb_buffer_sanity_check(tb->tb_sb,
2233                                                                tb->FR[i],
2234                                                                "FR", i);
2235                                         if (!clear_all_dirty_bits
2236                                             (tb->tb_sb, tb->FR[i]))
2237                                                 locked = tb->FR[i];
2238                                 }
2239
2240                                 if (!locked && tb->CFR[i]) {
2241                                         tb_buffer_sanity_check(tb->tb_sb,
2242                                                                tb->CFR[i],
2243                                                                "CFR", i);
2244                                         if (!clear_all_dirty_bits
2245                                             (tb->tb_sb, tb->CFR[i]))
2246                                                 locked = tb->CFR[i];
2247                                 }
2248                         }
2249                 }
2250                 /* as far as I can tell, this is not required.  The FEB list seems
2251                  ** to be full of newly allocated nodes, which will never be locked,
2252                  ** dirty, or anything else.
2253                  ** To be safe, I'm putting in the checks and waits in.  For the moment,
2254                  ** they are needed to keep the code in journal.c from complaining
2255                  ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
2256                  ** --clm
2257                  */
2258                 for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2259                         if (tb->FEB[i]) {
2260                                 if (!clear_all_dirty_bits
2261                                     (tb->tb_sb, tb->FEB[i]))
2262                                         locked = tb->FEB[i];
2263                         }
2264                 }
2265
2266                 if (locked) {
2267 #ifdef CONFIG_REISERFS_CHECK
2268                         repeat_counter++;
2269                         if ((repeat_counter % 10000) == 0) {
2270                                 reiserfs_warning(tb->tb_sb, "reiserfs-8200",
2271                                                  "too many iterations waiting "
2272                                                  "for buffer to unlock "
2273                                                  "(%b)", locked);
2274
2275                                 /* Don't loop forever.  Try to recover from possible error. */
2276
2277                                 return (FILESYSTEM_CHANGED_TB(tb)) ?
2278                                     REPEAT_SEARCH : CARRY_ON;
2279                         }
2280 #endif
2281                         __wait_on_buffer(locked);
2282                         if (FILESYSTEM_CHANGED_TB(tb))
2283                                 return REPEAT_SEARCH;
2284                 }
2285
2286         } while (locked);
2287
2288         return CARRY_ON;
2289 }
2290
2291 /* Prepare for balancing, that is
2292  *      get all necessary parents, and neighbors;
2293  *      analyze what and where should be moved;
2294  *      get sufficient number of new nodes;
2295  * Balancing will start only after all resources will be collected at a time.
2296  *
2297  * When ported to SMP kernels, only at the last moment after all needed nodes
2298  * are collected in cache, will the resources be locked using the usual
2299  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2300  * this code neither write locks what it does not need to write lock nor locks out of order
2301  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
2302  *
2303  * fix is meant in the sense of render unchanging
2304  *
2305  * Latency might be improved by first gathering a list of what buffers are needed
2306  * and then getting as many of them in parallel as possible? -Hans
2307  *
2308  * Parameters:
2309  *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2310  *      tb      tree_balance structure;
2311  *      inum    item number in S[h];
2312  *      pos_in_item - comment this if you can
2313  *      ins_ih  item head of item being inserted
2314  *      data    inserted item or data to be pasted
2315  * Returns:     1 - schedule occurred while the function worked;
2316  *              0 - schedule didn't occur while the function worked;
2317  *             -1 - if no_disk_space
2318  */
2319
2320 int fix_nodes(int op_mode, struct tree_balance *tb,
2321               struct item_head *ins_ih, const void *data)
2322 {
2323         int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
2324         int pos_in_item;
2325
2326         /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2327          ** during wait_tb_buffers_run
2328          */
2329         int wait_tb_buffers_run = 0;
2330         struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2331
2332         ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
2333
2334         pos_in_item = tb->tb_path->pos_in_item;
2335
2336         tb->fs_gen = get_generation(tb->tb_sb);
2337
2338         /* we prepare and log the super here so it will already be in the
2339          ** transaction when do_balance needs to change it.
2340          ** This way do_balance won't have to schedule when trying to prepare
2341          ** the super for logging
2342          */
2343         reiserfs_prepare_for_journal(tb->tb_sb,
2344                                      SB_BUFFER_WITH_SB(tb->tb_sb), 1);
2345         journal_mark_dirty(tb->transaction_handle, tb->tb_sb,
2346                            SB_BUFFER_WITH_SB(tb->tb_sb));
2347         if (FILESYSTEM_CHANGED_TB(tb))
2348                 return REPEAT_SEARCH;
2349
2350         /* if it possible in indirect_to_direct conversion */
2351         if (buffer_locked(tbS0)) {
2352                 __wait_on_buffer(tbS0);
2353                 if (FILESYSTEM_CHANGED_TB(tb))
2354                         return REPEAT_SEARCH;
2355         }
2356 #ifdef CONFIG_REISERFS_CHECK
2357         if (cur_tb) {
2358                 print_cur_tb("fix_nodes");
2359                 reiserfs_panic(tb->tb_sb, "PAP-8305",
2360                                "there is pending do_balance");
2361         }
2362
2363         if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
2364                 reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
2365                                "not uptodate at the beginning of fix_nodes "
2366                                "or not in tree (mode %c)",
2367                                tbS0, tbS0, op_mode);
2368
2369         /* Check parameters. */
2370         switch (op_mode) {
2371         case M_INSERT:
2372                 if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
2373                         reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
2374                                        "item number %d (in S0 - %d) in case "
2375                                        "of insert", item_num,
2376                                        B_NR_ITEMS(tbS0));
2377                 break;
2378         case M_PASTE:
2379         case M_DELETE:
2380         case M_CUT:
2381                 if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
2382                         print_block(tbS0, 0, -1, -1);
2383                         reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
2384                                        "item number(%d); mode = %c "
2385                                        "insert_size = %d",
2386                                        item_num, op_mode,
2387                                        tb->insert_size[0]);
2388                 }
2389                 break;
2390         default:
2391                 reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
2392                                "of operation");
2393         }
2394 #endif
2395
2396         if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
2397                 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2398                 return REPEAT_SEARCH;
2399
2400         /* Starting from the leaf level; for all levels h of the tree. */
2401         for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
2402                 ret = get_direct_parent(tb, h);
2403                 if (ret != CARRY_ON)
2404                         goto repeat;
2405
2406                 ret = check_balance(op_mode, tb, h, item_num,
2407                                     pos_in_item, ins_ih, data);
2408                 if (ret != CARRY_ON) {
2409                         if (ret == NO_BALANCING_NEEDED) {
2410                                 /* No balancing for higher levels needed. */
2411                                 ret = get_neighbors(tb, h);
2412                                 if (ret != CARRY_ON)
2413                                         goto repeat;
2414                                 if (h != MAX_HEIGHT - 1)
2415                                         tb->insert_size[h + 1] = 0;
2416                                 /* ok, analysis and resource gathering are complete */
2417                                 break;
2418                         }
2419                         goto repeat;
2420                 }
2421
2422                 ret = get_neighbors(tb, h);
2423                 if (ret != CARRY_ON)
2424                         goto repeat;
2425
2426                 /* No disk space, or schedule occurred and analysis may be
2427                  * invalid and needs to be redone. */
2428                 ret = get_empty_nodes(tb, h);
2429                 if (ret != CARRY_ON)
2430                         goto repeat;
2431
2432                 if (!PATH_H_PBUFFER(tb->tb_path, h)) {
2433                         /* We have a positive insert size but no nodes exist on this
2434                            level, this means that we are creating a new root. */
2435
2436                         RFALSE(tb->blknum[h] != 1,
2437                                "PAP-8350: creating new empty root");
2438
2439                         if (h < MAX_HEIGHT - 1)
2440                                 tb->insert_size[h + 1] = 0;
2441                 } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
2442                         if (tb->blknum[h] > 1) {
2443                                 /* The tree needs to be grown, so this node S[h]
2444                                    which is the root node is split into two nodes,
2445                                    and a new node (S[h+1]) will be created to
2446                                    become the root node.  */
2447
2448                                 RFALSE(h == MAX_HEIGHT - 1,
2449                                        "PAP-8355: attempt to create too high of a tree");
2450
2451                                 tb->insert_size[h + 1] =
2452                                     (DC_SIZE +
2453                                      KEY_SIZE) * (tb->blknum[h] - 1) +
2454                                     DC_SIZE;
2455                         } else if (h < MAX_HEIGHT - 1)
2456                                 tb->insert_size[h + 1] = 0;
2457                 } else
2458                         tb->insert_size[h + 1] =
2459                             (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
2460         }
2461
2462         ret = wait_tb_buffers_until_unlocked(tb);
2463         if (ret == CARRY_ON) {
2464                 if (FILESYSTEM_CHANGED_TB(tb)) {
2465                         wait_tb_buffers_run = 1;
2466                         ret = REPEAT_SEARCH;
2467                         goto repeat;
2468                 } else {
2469                         return CARRY_ON;
2470                 }
2471         } else {
2472                 wait_tb_buffers_run = 1;
2473                 goto repeat;
2474         }
2475
2476       repeat:
2477         // fix_nodes was unable to perform its calculation due to
2478         // filesystem got changed under us, lack of free disk space or i/o
2479         // failure. If the first is the case - the search will be
2480         // repeated. For now - free all resources acquired so far except
2481         // for the new allocated nodes
2482         {
2483                 int i;
2484
2485                 /* Release path buffers. */
2486                 if (wait_tb_buffers_run) {
2487                         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2488                 } else {
2489                         pathrelse(tb->tb_path);
2490                 }
2491                 /* brelse all resources collected for balancing */
2492                 for (i = 0; i < MAX_HEIGHT; i++) {
2493                         if (wait_tb_buffers_run) {
2494                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2495                                                                  tb->L[i]);
2496                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2497                                                                  tb->R[i]);
2498                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2499                                                                  tb->FL[i]);
2500                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2501                                                                  tb->FR[i]);
2502                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2503                                                                  tb->
2504                                                                  CFL[i]);
2505                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2506                                                                  tb->
2507                                                                  CFR[i]);
2508                         }
2509
2510                         brelse(tb->L[i]);
2511                         brelse(tb->R[i]);
2512                         brelse(tb->FL[i]);
2513                         brelse(tb->FR[i]);
2514                         brelse(tb->CFL[i]);
2515                         brelse(tb->CFR[i]);
2516
2517                         tb->L[i] = NULL;
2518                         tb->R[i] = NULL;
2519                         tb->FL[i] = NULL;
2520                         tb->FR[i] = NULL;
2521                         tb->CFL[i] = NULL;
2522                         tb->CFR[i] = NULL;
2523                 }
2524
2525                 if (wait_tb_buffers_run) {
2526                         for (i = 0; i < MAX_FEB_SIZE; i++) {
2527                                 if (tb->FEB[i])
2528                                         reiserfs_restore_prepared_buffer
2529                                             (tb->tb_sb, tb->FEB[i]);
2530                         }
2531                 }
2532                 return ret;
2533         }
2534
2535 }
2536
2537 /* Anatoly will probably forgive me renaming tb to tb. I just
2538    wanted to make lines shorter */
2539 void unfix_nodes(struct tree_balance *tb)
2540 {
2541         int i;
2542
2543         /* Release path buffers. */
2544         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2545
2546         /* brelse all resources collected for balancing */
2547         for (i = 0; i < MAX_HEIGHT; i++) {
2548                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2549                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2550                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2551                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2552                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2553                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2554
2555                 brelse(tb->L[i]);
2556                 brelse(tb->R[i]);
2557                 brelse(tb->FL[i]);
2558                 brelse(tb->FR[i]);
2559                 brelse(tb->CFL[i]);
2560                 brelse(tb->CFR[i]);
2561         }
2562
2563         /* deal with list of allocated (used and unused) nodes */
2564         for (i = 0; i < MAX_FEB_SIZE; i++) {
2565                 if (tb->FEB[i]) {
2566                         b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2567                         /* de-allocated block which was not used by balancing and
2568                            bforget about buffer for it */
2569                         brelse(tb->FEB[i]);
2570                         reiserfs_free_block(tb->transaction_handle, NULL,
2571                                             blocknr, 0);
2572                 }
2573                 if (tb->used[i]) {
2574                         /* release used as new nodes including a new root */
2575                         brelse(tb->used[i]);
2576                 }
2577         }
2578
2579         kfree(tb->vn_buf);
2580
2581 }