Merge upstream 2.6.13-rc3 into ieee80211 branch of netdev-2.6.
[linux-2.6] / fs / reiserfs / fix_node.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /**
6  ** old_item_num
7  ** old_entry_num
8  ** set_entry_sizes
9  ** create_virtual_node
10  ** check_left
11  ** check_right
12  ** directory_part_size
13  ** get_num_ver
14  ** set_parameters
15  ** is_leaf_removable
16  ** are_leaves_removable
17  ** get_empty_nodes
18  ** get_lfree
19  ** get_rfree
20  ** is_left_neighbor_in_cache
21  ** decrement_key
22  ** get_far_parent
23  ** get_parents
24  ** can_node_be_removed
25  ** ip_check_balance
26  ** dc_check_balance_internal
27  ** dc_check_balance_leaf
28  ** dc_check_balance
29  ** check_balance
30  ** get_direct_parent
31  ** get_neighbors
32  ** fix_nodes
33  ** 
34  ** 
35  **/
36
37 #include <linux/config.h>
38 #include <linux/time.h>
39 #include <linux/string.h>
40 #include <linux/reiserfs_fs.h>
41 #include <linux/buffer_head.h>
42
43 /* To make any changes in the tree we find a node, that contains item
44    to be changed/deleted or position in the node we insert a new item
45    to. We call this node S. To do balancing we need to decide what we
46    will shift to left/right neighbor, or to a new node, where new item
47    will be etc. To make this analysis simpler we build virtual
48    node. Virtual node is an array of items, that will replace items of
49    node S. (For instance if we are going to delete an item, virtual
50    node does not contain it). Virtual node keeps information about
51    item sizes and types, mergeability of first and last items, sizes
52    of all entries in directory item. We use this array of items when
53    calculating what we can shift to neighbors and how many nodes we
54    have to have if we do not any shiftings, if we shift to left/right
55    neighbor or to both. */
56
57 /* taking item number in virtual node, returns number of item, that it has in source buffer */
58 static inline int old_item_num(int new_num, int affected_item_num, int mode)
59 {
60         if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
61                 return new_num;
62
63         if (mode == M_INSERT) {
64
65                 RFALSE(new_num == 0,
66                        "vs-8005: for INSERT mode and item number of inserted item");
67
68                 return new_num - 1;
69         }
70
71         RFALSE(mode != M_DELETE,
72                "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
73                mode);
74         /* delete mode */
75         return new_num + 1;
76 }
77
78 static void create_virtual_node(struct tree_balance *tb, int h)
79 {
80         struct item_head *ih;
81         struct virtual_node *vn = tb->tb_vn;
82         int new_num;
83         struct buffer_head *Sh; /* this comes from tb->S[h] */
84
85         Sh = PATH_H_PBUFFER(tb->tb_path, h);
86
87         /* size of changed node */
88         vn->vn_size =
89             MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
90
91         /* for internal nodes array if virtual items is not created */
92         if (h) {
93                 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
94                 return;
95         }
96
97         /* number of items in virtual node  */
98         vn->vn_nr_item =
99             B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
100             ((vn->vn_mode == M_DELETE) ? 1 : 0);
101
102         /* first virtual item */
103         vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
104         memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
105         vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
106
107         /* first item in the node */
108         ih = B_N_PITEM_HEAD(Sh, 0);
109
110         /* define the mergeability for 0-th item (if it is not being deleted) */
111         if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
112             && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
113                 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
114
115         /* go through all items those remain in the virtual node (except for the new (inserted) one) */
116         for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
117                 int j;
118                 struct virtual_item *vi = vn->vn_vi + new_num;
119                 int is_affected =
120                     ((new_num != vn->vn_affected_item_num) ? 0 : 1);
121
122                 if (is_affected && vn->vn_mode == M_INSERT)
123                         continue;
124
125                 /* get item number in source node */
126                 j = old_item_num(new_num, vn->vn_affected_item_num,
127                                  vn->vn_mode);
128
129                 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
130                 vi->vi_ih = ih + j;
131                 vi->vi_item = B_I_PITEM(Sh, ih + j);
132                 vi->vi_uarea = vn->vn_free_ptr;
133
134                 // FIXME: there is no check, that item operation did not
135                 // consume too much memory
136                 vn->vn_free_ptr +=
137                     op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
138                 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
139                         reiserfs_panic(tb->tb_sb,
140                                        "vs-8030: create_virtual_node: "
141                                        "virtual node space consumed");
142
143                 if (!is_affected)
144                         /* this is not being changed */
145                         continue;
146
147                 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
148                         vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
149                         vi->vi_new_data = vn->vn_data;  // pointer to data which is going to be pasted
150                 }
151         }
152
153         /* virtual inserted item is not defined yet */
154         if (vn->vn_mode == M_INSERT) {
155                 struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
156
157                 RFALSE(vn->vn_ins_ih == 0,
158                        "vs-8040: item header of inserted item is not specified");
159                 vi->vi_item_len = tb->insert_size[0];
160                 vi->vi_ih = vn->vn_ins_ih;
161                 vi->vi_item = vn->vn_data;
162                 vi->vi_uarea = vn->vn_free_ptr;
163
164                 op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
165                              tb->insert_size[0]);
166         }
167
168         /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
169         if (tb->CFR[0]) {
170                 struct reiserfs_key *key;
171
172                 key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
173                 if (op_is_left_mergeable(key, Sh->b_size)
174                     && (vn->vn_mode != M_DELETE
175                         || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
176                         vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
177                             VI_TYPE_RIGHT_MERGEABLE;
178
179 #ifdef CONFIG_REISERFS_CHECK
180                 if (op_is_left_mergeable(key, Sh->b_size) &&
181                     !(vn->vn_mode != M_DELETE
182                       || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
183                         /* we delete last item and it could be merged with right neighbor's first item */
184                         if (!
185                             (B_NR_ITEMS(Sh) == 1
186                              && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
187                              && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
188                                 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
189                                 print_block(Sh, 0, -1, -1);
190                                 reiserfs_panic(tb->tb_sb,
191                                                "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
192                                                key, vn->vn_affected_item_num,
193                                                vn->vn_mode, M_DELETE);
194                         } else
195                                 /* we can delete directory item, that has only one directory entry in it */
196                                 ;
197                 }
198 #endif
199
200         }
201 }
202
203 /* using virtual node check, how many items can be shifted to left
204    neighbor */
205 static void check_left(struct tree_balance *tb, int h, int cur_free)
206 {
207         int i;
208         struct virtual_node *vn = tb->tb_vn;
209         struct virtual_item *vi;
210         int d_size, ih_size;
211
212         RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
213
214         /* internal level */
215         if (h > 0) {
216                 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
217                 return;
218         }
219
220         /* leaf level */
221
222         if (!cur_free || !vn->vn_nr_item) {
223                 /* no free space or nothing to move */
224                 tb->lnum[h] = 0;
225                 tb->lbytes = -1;
226                 return;
227         }
228
229         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
230                "vs-8055: parent does not exist or invalid");
231
232         vi = vn->vn_vi;
233         if ((unsigned int)cur_free >=
234             (vn->vn_size -
235              ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
236                 /* all contents of S[0] fits into L[0] */
237
238                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
239                        "vs-8055: invalid mode or balance condition failed");
240
241                 tb->lnum[0] = vn->vn_nr_item;
242                 tb->lbytes = -1;
243                 return;
244         }
245
246         d_size = 0, ih_size = IH_SIZE;
247
248         /* first item may be merge with last item in left neighbor */
249         if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
250                 d_size = -((int)IH_SIZE), ih_size = 0;
251
252         tb->lnum[0] = 0;
253         for (i = 0; i < vn->vn_nr_item;
254              i++, ih_size = IH_SIZE, d_size = 0, vi++) {
255                 d_size += vi->vi_item_len;
256                 if (cur_free >= d_size) {
257                         /* the item can be shifted entirely */
258                         cur_free -= d_size;
259                         tb->lnum[0]++;
260                         continue;
261                 }
262
263                 /* the item cannot be shifted entirely, try to split it */
264                 /* check whether L[0] can hold ih and at least one byte of the item body */
265                 if (cur_free <= ih_size) {
266                         /* cannot shift even a part of the current item */
267                         tb->lbytes = -1;
268                         return;
269                 }
270                 cur_free -= ih_size;
271
272                 tb->lbytes = op_check_left(vi, cur_free, 0, 0);
273                 if (tb->lbytes != -1)
274                         /* count partially shifted item */
275                         tb->lnum[0]++;
276
277                 break;
278         }
279
280         return;
281 }
282
283 /* using virtual node check, how many items can be shifted to right
284    neighbor */
285 static void check_right(struct tree_balance *tb, int h, int cur_free)
286 {
287         int i;
288         struct virtual_node *vn = tb->tb_vn;
289         struct virtual_item *vi;
290         int d_size, ih_size;
291
292         RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
293
294         /* internal level */
295         if (h > 0) {
296                 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
297                 return;
298         }
299
300         /* leaf level */
301
302         if (!cur_free || !vn->vn_nr_item) {
303                 /* no free space  */
304                 tb->rnum[h] = 0;
305                 tb->rbytes = -1;
306                 return;
307         }
308
309         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
310                "vs-8075: parent does not exist or invalid");
311
312         vi = vn->vn_vi + vn->vn_nr_item - 1;
313         if ((unsigned int)cur_free >=
314             (vn->vn_size -
315              ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
316                 /* all contents of S[0] fits into R[0] */
317
318                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
319                        "vs-8080: invalid mode or balance condition failed");
320
321                 tb->rnum[h] = vn->vn_nr_item;
322                 tb->rbytes = -1;
323                 return;
324         }
325
326         d_size = 0, ih_size = IH_SIZE;
327
328         /* last item may be merge with first item in right neighbor */
329         if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
330                 d_size = -(int)IH_SIZE, ih_size = 0;
331
332         tb->rnum[0] = 0;
333         for (i = vn->vn_nr_item - 1; i >= 0;
334              i--, d_size = 0, ih_size = IH_SIZE, vi--) {
335                 d_size += vi->vi_item_len;
336                 if (cur_free >= d_size) {
337                         /* the item can be shifted entirely */
338                         cur_free -= d_size;
339                         tb->rnum[0]++;
340                         continue;
341                 }
342
343                 /* check whether R[0] can hold ih and at least one byte of the item body */
344                 if (cur_free <= ih_size) {      /* cannot shift even a part of the current item */
345                         tb->rbytes = -1;
346                         return;
347                 }
348
349                 /* R[0] can hold the header of the item and at least one byte of its body */
350                 cur_free -= ih_size;    /* cur_free is still > 0 */
351
352                 tb->rbytes = op_check_right(vi, cur_free);
353                 if (tb->rbytes != -1)
354                         /* count partially shifted item */
355                         tb->rnum[0]++;
356
357                 break;
358         }
359
360         return;
361 }
362
363 /*
364  * from - number of items, which are shifted to left neighbor entirely
365  * to - number of item, which are shifted to right neighbor entirely
366  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
367  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
368 static int get_num_ver(int mode, struct tree_balance *tb, int h,
369                        int from, int from_bytes,
370                        int to, int to_bytes, short *snum012, int flow)
371 {
372         int i;
373         int cur_free;
374         //    int bytes;
375         int units;
376         struct virtual_node *vn = tb->tb_vn;
377         //    struct virtual_item * vi;
378
379         int total_node_size, max_node_size, current_item_size;
380         int needed_nodes;
381         int start_item,         /* position of item we start filling node from */
382          end_item,              /* position of item we finish filling node by */
383          start_bytes,           /* number of first bytes (entries for directory) of start_item-th item 
384                                    we do not include into node that is being filled */
385          end_bytes;             /* number of last bytes (entries for directory) of end_item-th item 
386                                    we do node include into node that is being filled */
387         int split_item_positions[2];    /* these are positions in virtual item of
388                                            items, that are split between S[0] and
389                                            S1new and S1new and S2new */
390
391         split_item_positions[0] = -1;
392         split_item_positions[1] = -1;
393
394         /* We only create additional nodes if we are in insert or paste mode
395            or we are in replace mode at the internal level. If h is 0 and
396            the mode is M_REPLACE then in fix_nodes we change the mode to
397            paste or insert before we get here in the code.  */
398         RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
399                "vs-8100: insert_size < 0 in overflow");
400
401         max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
402
403         /* snum012 [0-2] - number of items, that lay
404            to S[0], first new node and second new node */
405         snum012[3] = -1;        /* s1bytes */
406         snum012[4] = -1;        /* s2bytes */
407
408         /* internal level */
409         if (h > 0) {
410                 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
411                 if (i == max_node_size)
412                         return 1;
413                 return (i / max_node_size + 1);
414         }
415
416         /* leaf level */
417         needed_nodes = 1;
418         total_node_size = 0;
419         cur_free = max_node_size;
420
421         // start from 'from'-th item
422         start_item = from;
423         // skip its first 'start_bytes' units
424         start_bytes = ((from_bytes != -1) ? from_bytes : 0);
425
426         // last included item is the 'end_item'-th one
427         end_item = vn->vn_nr_item - to - 1;
428         // do not count last 'end_bytes' units of 'end_item'-th item
429         end_bytes = (to_bytes != -1) ? to_bytes : 0;
430
431         /* go through all item beginning from the start_item-th item and ending by
432            the end_item-th item. Do not count first 'start_bytes' units of
433            'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
434
435         for (i = start_item; i <= end_item; i++) {
436                 struct virtual_item *vi = vn->vn_vi + i;
437                 int skip_from_end = ((i == end_item) ? end_bytes : 0);
438
439                 RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
440
441                 /* get size of current item */
442                 current_item_size = vi->vi_item_len;
443
444                 /* do not take in calculation head part (from_bytes) of from-th item */
445                 current_item_size -=
446                     op_part_size(vi, 0 /*from start */ , start_bytes);
447
448                 /* do not take in calculation tail part of last item */
449                 current_item_size -=
450                     op_part_size(vi, 1 /*from end */ , skip_from_end);
451
452                 /* if item fits into current node entierly */
453                 if (total_node_size + current_item_size <= max_node_size) {
454                         snum012[needed_nodes - 1]++;
455                         total_node_size += current_item_size;
456                         start_bytes = 0;
457                         continue;
458                 }
459
460                 if (current_item_size > max_node_size) {
461                         /* virtual item length is longer, than max size of item in
462                            a node. It is impossible for direct item */
463                         RFALSE(is_direct_le_ih(vi->vi_ih),
464                                "vs-8110: "
465                                "direct item length is %d. It can not be longer than %d",
466                                current_item_size, max_node_size);
467                         /* we will try to split it */
468                         flow = 1;
469                 }
470
471                 if (!flow) {
472                         /* as we do not split items, take new node and continue */
473                         needed_nodes++;
474                         i--;
475                         total_node_size = 0;
476                         continue;
477                 }
478                 // calculate number of item units which fit into node being
479                 // filled
480                 {
481                         int free_space;
482
483                         free_space = max_node_size - total_node_size - IH_SIZE;
484                         units =
485                             op_check_left(vi, free_space, start_bytes,
486                                           skip_from_end);
487                         if (units == -1) {
488                                 /* nothing fits into current node, take new node and continue */
489                                 needed_nodes++, i--, total_node_size = 0;
490                                 continue;
491                         }
492                 }
493
494                 /* something fits into the current node */
495                 //if (snum012[3] != -1 || needed_nodes != 1)
496                 //  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
497                 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
498                 start_bytes += units;
499                 snum012[needed_nodes - 1 + 3] = units;
500
501                 if (needed_nodes > 2)
502                         reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: "
503                                          "split_item_position is out of boundary");
504                 snum012[needed_nodes - 1]++;
505                 split_item_positions[needed_nodes - 1] = i;
506                 needed_nodes++;
507                 /* continue from the same item with start_bytes != -1 */
508                 start_item = i;
509                 i--;
510                 total_node_size = 0;
511         }
512
513         // sum012[4] (if it is not -1) contains number of units of which
514         // are to be in S1new, snum012[3] - to be in S0. They are supposed
515         // to be S1bytes and S2bytes correspondingly, so recalculate
516         if (snum012[4] > 0) {
517                 int split_item_num;
518                 int bytes_to_r, bytes_to_l;
519                 int bytes_to_S1new;
520
521                 split_item_num = split_item_positions[1];
522                 bytes_to_l =
523                     ((from == split_item_num
524                       && from_bytes != -1) ? from_bytes : 0);
525                 bytes_to_r =
526                     ((end_item == split_item_num
527                       && end_bytes != -1) ? end_bytes : 0);
528                 bytes_to_S1new =
529                     ((split_item_positions[0] ==
530                       split_item_positions[1]) ? snum012[3] : 0);
531
532                 // s2bytes
533                 snum012[4] =
534                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
535                     bytes_to_r - bytes_to_l - bytes_to_S1new;
536
537                 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
538                     vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
539                         reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not "
540                                          "directory or indirect item");
541         }
542
543         /* now we know S2bytes, calculate S1bytes */
544         if (snum012[3] > 0) {
545                 int split_item_num;
546                 int bytes_to_r, bytes_to_l;
547                 int bytes_to_S2new;
548
549                 split_item_num = split_item_positions[0];
550                 bytes_to_l =
551                     ((from == split_item_num
552                       && from_bytes != -1) ? from_bytes : 0);
553                 bytes_to_r =
554                     ((end_item == split_item_num
555                       && end_bytes != -1) ? end_bytes : 0);
556                 bytes_to_S2new =
557                     ((split_item_positions[0] == split_item_positions[1]
558                       && snum012[4] != -1) ? snum012[4] : 0);
559
560                 // s1bytes
561                 snum012[3] =
562                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
563                     bytes_to_r - bytes_to_l - bytes_to_S2new;
564         }
565
566         return needed_nodes;
567 }
568
569 #ifdef CONFIG_REISERFS_CHECK
570 extern struct tree_balance *cur_tb;
571 #endif
572
573 /* Set parameters for balancing.
574  * Performs write of results of analysis of balancing into structure tb,
575  * where it will later be used by the functions that actually do the balancing. 
576  * Parameters:
577  *      tb      tree_balance structure;
578  *      h       current level of the node;
579  *      lnum    number of items from S[h] that must be shifted to L[h];
580  *      rnum    number of items from S[h] that must be shifted to R[h];
581  *      blk_num number of blocks that S[h] will be splitted into;
582  *      s012    number of items that fall into splitted nodes.
583  *      lbytes  number of bytes which flow to the left neighbor from the item that is not
584  *              not shifted entirely
585  *      rbytes  number of bytes which flow to the right neighbor from the item that is not
586  *              not shifted entirely
587  *      s1bytes number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
588  */
589
590 static void set_parameters(struct tree_balance *tb, int h, int lnum,
591                            int rnum, int blk_num, short *s012, int lb, int rb)
592 {
593
594         tb->lnum[h] = lnum;
595         tb->rnum[h] = rnum;
596         tb->blknum[h] = blk_num;
597
598         if (h == 0) {           /* only for leaf level */
599                 if (s012 != NULL) {
600                         tb->s0num = *s012++,
601                             tb->s1num = *s012++, tb->s2num = *s012++;
602                         tb->s1bytes = *s012++;
603                         tb->s2bytes = *s012;
604                 }
605                 tb->lbytes = lb;
606                 tb->rbytes = rb;
607         }
608         PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
609         PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
610
611         PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
612         PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
613 }
614
615 /* check, does node disappear if we shift tb->lnum[0] items to left
616    neighbor and tb->rnum[0] to the right one. */
617 static int is_leaf_removable(struct tree_balance *tb)
618 {
619         struct virtual_node *vn = tb->tb_vn;
620         int to_left, to_right;
621         int size;
622         int remain_items;
623
624         /* number of items, that will be shifted to left (right) neighbor
625            entirely */
626         to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
627         to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
628         remain_items = vn->vn_nr_item;
629
630         /* how many items remain in S[0] after shiftings to neighbors */
631         remain_items -= (to_left + to_right);
632
633         if (remain_items < 1) {
634                 /* all content of node can be shifted to neighbors */
635                 set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
636                                NULL, -1, -1);
637                 return 1;
638         }
639
640         if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
641                 /* S[0] is not removable */
642                 return 0;
643
644         /* check, whether we can divide 1 remaining item between neighbors */
645
646         /* get size of remaining item (in item units) */
647         size = op_unit_num(&(vn->vn_vi[to_left]));
648
649         if (tb->lbytes + tb->rbytes >= size) {
650                 set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
651                                tb->lbytes, -1);
652                 return 1;
653         }
654
655         return 0;
656 }
657
658 /* check whether L, S, R can be joined in one node */
659 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
660 {
661         struct virtual_node *vn = tb->tb_vn;
662         int ih_size;
663         struct buffer_head *S0;
664
665         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
666
667         ih_size = 0;
668         if (vn->vn_nr_item) {
669                 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
670                         ih_size += IH_SIZE;
671
672                 if (vn->vn_vi[vn->vn_nr_item - 1].
673                     vi_type & VI_TYPE_RIGHT_MERGEABLE)
674                         ih_size += IH_SIZE;
675         } else {
676                 /* there was only one item and it will be deleted */
677                 struct item_head *ih;
678
679                 RFALSE(B_NR_ITEMS(S0) != 1,
680                        "vs-8125: item number must be 1: it is %d",
681                        B_NR_ITEMS(S0));
682
683                 ih = B_N_PITEM_HEAD(S0, 0);
684                 if (tb->CFR[0]
685                     && !comp_short_le_keys(&(ih->ih_key),
686                                            B_N_PDELIM_KEY(tb->CFR[0],
687                                                           tb->rkey[0])))
688                         if (is_direntry_le_ih(ih)) {
689                                 /* Directory must be in correct state here: that is
690                                    somewhere at the left side should exist first directory
691                                    item. But the item being deleted can not be that first
692                                    one because its right neighbor is item of the same
693                                    directory. (But first item always gets deleted in last
694                                    turn). So, neighbors of deleted item can be merged, so
695                                    we can save ih_size */
696                                 ih_size = IH_SIZE;
697
698                                 /* we might check that left neighbor exists and is of the
699                                    same directory */
700                                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
701                                        "vs-8130: first directory item can not be removed until directory is not empty");
702                         }
703
704         }
705
706         if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
707                 set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
708                 PROC_INFO_INC(tb->tb_sb, leaves_removable);
709                 return 1;
710         }
711         return 0;
712
713 }
714
715 /* when we do not split item, lnum and rnum are numbers of entire items */
716 #define SET_PAR_SHIFT_LEFT \
717 if (h)\
718 {\
719    int to_l;\
720    \
721    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
722               (MAX_NR_KEY(Sh) + 1 - lpar);\
723               \
724               set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
725 }\
726 else \
727 {\
728    if (lset==LEFT_SHIFT_FLOW)\
729      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
730                      tb->lbytes, -1);\
731    else\
732      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
733                      -1, -1);\
734 }
735
736 #define SET_PAR_SHIFT_RIGHT \
737 if (h)\
738 {\
739    int to_r;\
740    \
741    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
742    \
743    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
744 }\
745 else \
746 {\
747    if (rset==RIGHT_SHIFT_FLOW)\
748      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
749                   -1, tb->rbytes);\
750    else\
751      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
752                   -1, -1);\
753 }
754
755 static void free_buffers_in_tb(struct tree_balance *p_s_tb)
756 {
757         int n_counter;
758
759         decrement_counters_in_path(p_s_tb->tb_path);
760
761         for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) {
762                 decrement_bcount(p_s_tb->L[n_counter]);
763                 p_s_tb->L[n_counter] = NULL;
764                 decrement_bcount(p_s_tb->R[n_counter]);
765                 p_s_tb->R[n_counter] = NULL;
766                 decrement_bcount(p_s_tb->FL[n_counter]);
767                 p_s_tb->FL[n_counter] = NULL;
768                 decrement_bcount(p_s_tb->FR[n_counter]);
769                 p_s_tb->FR[n_counter] = NULL;
770                 decrement_bcount(p_s_tb->CFL[n_counter]);
771                 p_s_tb->CFL[n_counter] = NULL;
772                 decrement_bcount(p_s_tb->CFR[n_counter]);
773                 p_s_tb->CFR[n_counter] = NULL;
774         }
775 }
776
777 /* Get new buffers for storing new nodes that are created while balancing.
778  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
779  *              CARRY_ON - schedule didn't occur while the function worked;
780  *              NO_DISK_SPACE - no disk space.
781  */
782 /* The function is NOT SCHEDULE-SAFE! */
783 static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
784 {
785         struct buffer_head *p_s_new_bh,
786             *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h);
787         b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
788         int n_counter, n_number_of_freeblk, n_amount_needed,    /* number of needed empty blocks */
789          n_retval = CARRY_ON;
790         struct super_block *p_s_sb = p_s_tb->tb_sb;
791
792         /* number_of_freeblk is the number of empty blocks which have been
793            acquired for use by the balancing algorithm minus the number of
794            empty blocks used in the previous levels of the analysis,
795            number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
796            after empty blocks are acquired, and the balancing analysis is
797            then restarted, amount_needed is the number needed by this level
798            (n_h) of the balancing analysis.
799
800            Note that for systems with many processes writing, it would be
801            more layout optimal to calculate the total number needed by all
802            levels and then to run reiserfs_new_blocks to get all of them at once.  */
803
804         /* Initiate number_of_freeblk to the amount acquired prior to the restart of
805            the analysis or 0 if not restarted, then subtract the amount needed
806            by all of the levels of the tree below n_h. */
807         /* blknum includes S[n_h], so we subtract 1 in this calculation */
808         for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum;
809              n_counter < n_h; n_counter++)
810                 n_number_of_freeblk -=
811                     (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] -
812                                                    1) : 0;
813
814         /* Allocate missing empty blocks. */
815         /* if p_s_Sh == 0  then we are getting a new root */
816         n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1;
817         /*  Amount_needed = the amount that we need more than the amount that we have. */
818         if (n_amount_needed > n_number_of_freeblk)
819                 n_amount_needed -= n_number_of_freeblk;
820         else                    /* If we have enough already then there is nothing to do. */
821                 return CARRY_ON;
822
823         /* No need to check quota - is not allocated for blocks used for formatted nodes */
824         if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs,
825                                        n_amount_needed) == NO_DISK_SPACE)
826                 return NO_DISK_SPACE;
827
828         /* for each blocknumber we just got, get a buffer and stick it on FEB */
829         for (p_n_blocknr = a_n_blocknrs, n_counter = 0;
830              n_counter < n_amount_needed; p_n_blocknr++, n_counter++) {
831
832                 RFALSE(!*p_n_blocknr,
833                        "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
834
835                 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
836                 RFALSE(buffer_dirty(p_s_new_bh) ||
837                        buffer_journaled(p_s_new_bh) ||
838                        buffer_journal_dirty(p_s_new_bh),
839                        "PAP-8140: journlaled or dirty buffer %b for the new block",
840                        p_s_new_bh);
841
842                 /* Put empty buffers into the array. */
843                 RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum],
844                        "PAP-8141: busy slot for new buffer");
845
846                 set_buffer_journal_new(p_s_new_bh);
847                 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
848         }
849
850         if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb))
851                 n_retval = REPEAT_SEARCH;
852
853         return n_retval;
854 }
855
856 /* Get free space of the left neighbor, which is stored in the parent
857  * node of the left neighbor.  */
858 static int get_lfree(struct tree_balance *tb, int h)
859 {
860         struct buffer_head *l, *f;
861         int order;
862
863         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
864                 return 0;
865
866         if (f == l)
867                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
868         else {
869                 order = B_NR_ITEMS(l);
870                 f = l;
871         }
872
873         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
874 }
875
876 /* Get free space of the right neighbor,
877  * which is stored in the parent node of the right neighbor.
878  */
879 static int get_rfree(struct tree_balance *tb, int h)
880 {
881         struct buffer_head *r, *f;
882         int order;
883
884         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
885                 return 0;
886
887         if (f == r)
888                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
889         else {
890                 order = 0;
891                 f = r;
892         }
893
894         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
895
896 }
897
898 /* Check whether left neighbor is in memory. */
899 static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
900 {
901         struct buffer_head *p_s_father, *left;
902         struct super_block *p_s_sb = p_s_tb->tb_sb;
903         b_blocknr_t n_left_neighbor_blocknr;
904         int n_left_neighbor_position;
905
906         if (!p_s_tb->FL[n_h])   /* Father of the left neighbor does not exist. */
907                 return 0;
908
909         /* Calculate father of the node to be balanced. */
910         p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
911
912         RFALSE(!p_s_father ||
913                !B_IS_IN_TREE(p_s_father) ||
914                !B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
915                !buffer_uptodate(p_s_father) ||
916                !buffer_uptodate(p_s_tb->FL[n_h]),
917                "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
918                p_s_father, p_s_tb->FL[n_h]);
919
920         /* Get position of the pointer to the left neighbor into the left father. */
921         n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ?
922             p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]);
923         /* Get left neighbor block number. */
924         n_left_neighbor_blocknr =
925             B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
926         /* Look for the left neighbor in the cache. */
927         if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) {
928
929                 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
930                        "vs-8170: left neighbor (%b %z) is not in the tree",
931                        left, left);
932                 put_bh(left);
933                 return 1;
934         }
935
936         return 0;
937 }
938
939 #define LEFT_PARENTS  'l'
940 #define RIGHT_PARENTS 'r'
941
942 static void decrement_key(struct cpu_key *p_s_key)
943 {
944         // call item specific function for this key
945         item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key);
946 }
947
948 /* Calculate far left/right parent of the left/right neighbor of the current node, that
949  * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
950  * Calculate left/right common parent of the current node and L[h]/R[h].
951  * Calculate left/right delimiting key position.
952  * Returns:     PATH_INCORRECT   - path in the tree is not correct;
953                 SCHEDULE_OCCURRED - schedule occurred while the function worked;
954  *              CARRY_ON         - schedule didn't occur while the function worked;
955  */
956 static int get_far_parent(struct tree_balance *p_s_tb,
957                           int n_h,
958                           struct buffer_head **pp_s_father,
959                           struct buffer_head **pp_s_com_father, char c_lr_par)
960 {
961         struct buffer_head *p_s_parent;
962         INITIALIZE_PATH(s_path_to_neighbor_father);
963         struct path *p_s_path = p_s_tb->tb_path;
964         struct cpu_key s_lr_father_key;
965         int n_counter,
966             n_position = INT_MAX,
967             n_first_last_position = 0,
968             n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
969
970         /* Starting from F[n_h] go upwards in the tree, and look for the common
971            ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
972
973         n_counter = n_path_offset;
974
975         RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET,
976                "PAP-8180: invalid path length");
977
978         for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) {
979                 /* Check whether parent of the current buffer in the path is really parent in the tree. */
980                 if (!B_IS_IN_TREE
981                     (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)))
982                         return REPEAT_SEARCH;
983                 /* Check whether position in the parent is correct. */
984                 if ((n_position =
985                      PATH_OFFSET_POSITION(p_s_path,
986                                           n_counter - 1)) >
987                     B_NR_ITEMS(p_s_parent))
988                         return REPEAT_SEARCH;
989                 /* Check whether parent at the path really points to the child. */
990                 if (B_N_CHILD_NUM(p_s_parent, n_position) !=
991                     PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr)
992                         return REPEAT_SEARCH;
993                 /* Return delimiting key if position in the parent is not equal to first/last one. */
994                 if (c_lr_par == RIGHT_PARENTS)
995                         n_first_last_position = B_NR_ITEMS(p_s_parent);
996                 if (n_position != n_first_last_position) {
997                         *pp_s_com_father = p_s_parent;
998                         get_bh(*pp_s_com_father);
999                         /*(*pp_s_com_father = p_s_parent)->b_count++; */
1000                         break;
1001                 }
1002         }
1003
1004         /* if we are in the root of the tree, then there is no common father */
1005         if (n_counter == FIRST_PATH_ELEMENT_OFFSET) {
1006                 /* Check whether first buffer in the path is the root of the tree. */
1007                 if (PATH_OFFSET_PBUFFER
1008                     (p_s_tb->tb_path,
1009                      FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1010                     SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
1011                         *pp_s_father = *pp_s_com_father = NULL;
1012                         return CARRY_ON;
1013                 }
1014                 return REPEAT_SEARCH;
1015         }
1016
1017         RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
1018                "PAP-8185: (%b %z) level too small",
1019                *pp_s_com_father, *pp_s_com_father);
1020
1021         /* Check whether the common parent is locked. */
1022
1023         if (buffer_locked(*pp_s_com_father)) {
1024                 __wait_on_buffer(*pp_s_com_father);
1025                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1026                         decrement_bcount(*pp_s_com_father);
1027                         return REPEAT_SEARCH;
1028                 }
1029         }
1030
1031         /* So, we got common parent of the current node and its left/right neighbor.
1032            Now we are geting the parent of the left/right neighbor. */
1033
1034         /* Form key to get parent of the left/right neighbor. */
1035         le_key2cpu_key(&s_lr_father_key,
1036                        B_N_PDELIM_KEY(*pp_s_com_father,
1037                                       (c_lr_par ==
1038                                        LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] =
1039                                                         n_position -
1040                                                         1) : (p_s_tb->rkey[n_h -
1041                                                                            1] =
1042                                                               n_position)));
1043
1044         if (c_lr_par == LEFT_PARENTS)
1045                 decrement_key(&s_lr_father_key);
1046
1047         if (search_by_key
1048             (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1049              n_h + 1) == IO_ERROR)
1050                 // path is released
1051                 return IO_ERROR;
1052
1053         if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1054                 decrement_counters_in_path(&s_path_to_neighbor_father);
1055                 decrement_bcount(*pp_s_com_father);
1056                 return REPEAT_SEARCH;
1057         }
1058
1059         *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1060
1061         RFALSE(B_LEVEL(*pp_s_father) != n_h + 1,
1062                "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1063         RFALSE(s_path_to_neighbor_father.path_length <
1064                FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1065
1066         s_path_to_neighbor_father.path_length--;
1067         decrement_counters_in_path(&s_path_to_neighbor_father);
1068         return CARRY_ON;
1069 }
1070
1071 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1072  * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1073  * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1074  * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1075  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1076  *              CARRY_ON - schedule didn't occur while the function worked;
1077  */
1078 static int get_parents(struct tree_balance *p_s_tb, int n_h)
1079 {
1080         struct path *p_s_path = p_s_tb->tb_path;
1081         int n_position,
1082             n_ret_value,
1083             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1084         struct buffer_head *p_s_curf, *p_s_curcf;
1085
1086         /* Current node is the root of the tree or will be root of the tree */
1087         if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1088                 /* The root can not have parents.
1089                    Release nodes which previously were obtained as parents of the current node neighbors. */
1090                 decrement_bcount(p_s_tb->FL[n_h]);
1091                 decrement_bcount(p_s_tb->CFL[n_h]);
1092                 decrement_bcount(p_s_tb->FR[n_h]);
1093                 decrement_bcount(p_s_tb->CFR[n_h]);
1094                 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] =
1095                     p_s_tb->CFR[n_h] = NULL;
1096                 return CARRY_ON;
1097         }
1098
1099         /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1100         if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) {
1101                 /* Current node is not the first child of its parent. */
1102                 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
1103                 p_s_curf = p_s_curcf =
1104                     PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1105                 get_bh(p_s_curf);
1106                 get_bh(p_s_curf);
1107                 p_s_tb->lkey[n_h] = n_position - 1;
1108         } else {
1109                 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1110                    Calculate current common parent of L[n_path_offset] and the current node. Note that
1111                    CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1112                    Calculate lkey[n_path_offset]. */
1113                 if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1114                                                   &p_s_curcf,
1115                                                   LEFT_PARENTS)) != CARRY_ON)
1116                         return n_ret_value;
1117         }
1118
1119         decrement_bcount(p_s_tb->FL[n_h]);
1120         p_s_tb->FL[n_h] = p_s_curf;     /* New initialization of FL[n_h]. */
1121         decrement_bcount(p_s_tb->CFL[n_h]);
1122         p_s_tb->CFL[n_h] = p_s_curcf;   /* New initialization of CFL[n_h]. */
1123
1124         RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
1125                (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
1126                "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1127
1128 /* Get parent FR[n_h] of R[n_h]. */
1129
1130 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1131         if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) {
1132 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1133    Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1134    not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1135                 if ((n_ret_value =
1136                      get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf,
1137                                     RIGHT_PARENTS)) != CARRY_ON)
1138                         return n_ret_value;
1139         } else {
1140 /* Current node is not the last child of its parent F[n_h]. */
1141                 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
1142                 p_s_curf = p_s_curcf =
1143                     PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1144                 get_bh(p_s_curf);
1145                 get_bh(p_s_curf);
1146                 p_s_tb->rkey[n_h] = n_position;
1147         }
1148
1149         decrement_bcount(p_s_tb->FR[n_h]);
1150         p_s_tb->FR[n_h] = p_s_curf;     /* New initialization of FR[n_path_offset]. */
1151
1152         decrement_bcount(p_s_tb->CFR[n_h]);
1153         p_s_tb->CFR[n_h] = p_s_curcf;   /* New initialization of CFR[n_path_offset]. */
1154
1155         RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
1156                (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
1157                "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
1158
1159         return CARRY_ON;
1160 }
1161
1162 /* it is possible to remove node as result of shiftings to
1163    neighbors even when we insert or paste item. */
1164 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1165                                       struct tree_balance *tb, int h)
1166 {
1167         struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1168         int levbytes = tb->insert_size[h];
1169         struct item_head *ih;
1170         struct reiserfs_key *r_key = NULL;
1171
1172         ih = B_N_PITEM_HEAD(Sh, 0);
1173         if (tb->CFR[h])
1174                 r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
1175
1176         if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1177             /* shifting may merge items which might save space */
1178             -
1179             ((!h
1180               && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
1181             -
1182             ((!h && r_key
1183               && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1184             + ((h) ? KEY_SIZE : 0)) {
1185                 /* node can not be removed */
1186                 if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1187                         if (!h)
1188                                 tb->s0num =
1189                                     B_NR_ITEMS(Sh) +
1190                                     ((mode == M_INSERT) ? 1 : 0);
1191                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1192                         return NO_BALANCING_NEEDED;
1193                 }
1194         }
1195         PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1196         return !NO_BALANCING_NEEDED;
1197 }
1198
1199 /* Check whether current node S[h] is balanced when increasing its size by
1200  * Inserting or Pasting.
1201  * Calculate parameters for balancing for current level h.
1202  * Parameters:
1203  *      tb      tree_balance structure;
1204  *      h       current level of the node;
1205  *      inum    item number in S[h];
1206  *      mode    i - insert, p - paste;
1207  * Returns:     1 - schedule occurred; 
1208  *              0 - balancing for higher levels needed;
1209  *             -1 - no balancing for higher levels needed;
1210  *             -2 - no disk space.
1211  */
1212 /* ip means Inserting or Pasting */
1213 static int ip_check_balance(struct tree_balance *tb, int h)
1214 {
1215         struct virtual_node *vn = tb->tb_vn;
1216         int levbytes,           /* Number of bytes that must be inserted into (value
1217                                    is negative if bytes are deleted) buffer which
1218                                    contains node being balanced.  The mnemonic is
1219                                    that the attempted change in node space used level
1220                                    is levbytes bytes. */
1221          n_ret_value;
1222
1223         int lfree, sfree, rfree /* free space in L, S and R */ ;
1224
1225         /* nver is short for number of vertixes, and lnver is the number if
1226            we shift to the left, rnver is the number if we shift to the
1227            right, and lrnver is the number if we shift in both directions.
1228            The goal is to minimize first the number of vertixes, and second,
1229            the number of vertixes whose contents are changed by shifting,
1230            and third the number of uncached vertixes whose contents are
1231            changed by shifting and must be read from disk.  */
1232         int nver, lnver, rnver, lrnver;
1233
1234         /* used at leaf level only, S0 = S[0] is the node being balanced,
1235            sInum [ I = 0,1,2 ] is the number of items that will
1236            remain in node SI after balancing.  S1 and S2 are new
1237            nodes that might be created. */
1238
1239         /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
1240            where 4th parameter is s1bytes and 5th - s2bytes
1241          */
1242         short snum012[40] = { 0, };     /* s0num, s1num, s2num for 8 cases 
1243                                            0,1 - do not shift and do not shift but bottle
1244                                            2 - shift only whole item to left
1245                                            3 - shift to left and bottle as much as possible
1246                                            4,5 - shift to right (whole items and as much as possible
1247                                            6,7 - shift to both directions (whole items and as much as possible)
1248                                          */
1249
1250         /* Sh is the node whose balance is currently being checked */
1251         struct buffer_head *Sh;
1252
1253         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1254         levbytes = tb->insert_size[h];
1255
1256         /* Calculate balance parameters for creating new root. */
1257         if (!Sh) {
1258                 if (!h)
1259                         reiserfs_panic(tb->tb_sb,
1260                                        "vs-8210: ip_check_balance: S[0] can not be 0");
1261                 switch (n_ret_value = get_empty_nodes(tb, h)) {
1262                 case CARRY_ON:
1263                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1264                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1265
1266                 case NO_DISK_SPACE:
1267                 case REPEAT_SEARCH:
1268                         return n_ret_value;
1269                 default:
1270                         reiserfs_panic(tb->tb_sb,
1271                                        "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1272                 }
1273         }
1274
1275         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)     /* get parents of S[h] neighbors. */
1276                 return n_ret_value;
1277
1278         sfree = B_FREE_SPACE(Sh);
1279
1280         /* get free space of neighbors */
1281         rfree = get_rfree(tb, h);
1282         lfree = get_lfree(tb, h);
1283
1284         if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1285             NO_BALANCING_NEEDED)
1286                 /* and new item fits into node S[h] without any shifting */
1287                 return NO_BALANCING_NEEDED;
1288
1289         create_virtual_node(tb, h);
1290
1291         /*  
1292            determine maximal number of items we can shift to the left neighbor (in tb structure)
1293            and the maximal number of bytes that can flow to the left neighbor
1294            from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1295          */
1296         check_left(tb, h, lfree);
1297
1298         /*
1299            determine maximal number of items we can shift to the right neighbor (in tb structure)
1300            and the maximal number of bytes that can flow to the right neighbor
1301            from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1302          */
1303         check_right(tb, h, rfree);
1304
1305         /* all contents of internal node S[h] can be moved into its
1306            neighbors, S[h] will be removed after balancing */
1307         if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1308                 int to_r;
1309
1310                 /* Since we are working on internal nodes, and our internal
1311                    nodes have fixed size entries, then we can balance by the
1312                    number of items rather than the space they consume.  In this
1313                    routine we set the left node equal to the right node,
1314                    allowing a difference of less than or equal to 1 child
1315                    pointer. */
1316                 to_r =
1317                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1318                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1319                                                 tb->rnum[h]);
1320                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1321                                -1, -1);
1322                 return CARRY_ON;
1323         }
1324
1325         /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1326         RFALSE(h &&
1327                (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1328                 tb->rnum[h] >= vn->vn_nr_item + 1),
1329                "vs-8220: tree is not balanced on internal level");
1330         RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1331                       (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1332                "vs-8225: tree is not balanced on leaf level");
1333
1334         /* all contents of S[0] can be moved into its neighbors
1335            S[0] will be removed after balancing. */
1336         if (!h && is_leaf_removable(tb))
1337                 return CARRY_ON;
1338
1339         /* why do we perform this check here rather than earlier??
1340            Answer: we can win 1 node in some cases above. Moreover we
1341            checked it above, when we checked, that S[0] is not removable
1342            in principle */
1343         if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1344                 if (!h)
1345                         tb->s0num = vn->vn_nr_item;
1346                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1347                 return NO_BALANCING_NEEDED;
1348         }
1349
1350         {
1351                 int lpar, rpar, nset, lset, rset, lrset;
1352                 /* 
1353                  * regular overflowing of the node
1354                  */
1355
1356                 /* get_num_ver works in 2 modes (FLOW & NO_FLOW) 
1357                    lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1358                    nset, lset, rset, lrset - shows, whether flowing items give better packing 
1359                  */
1360 #define FLOW 1
1361 #define NO_FLOW 0               /* do not any splitting */
1362
1363                 /* we choose one the following */
1364 #define NOTHING_SHIFT_NO_FLOW   0
1365 #define NOTHING_SHIFT_FLOW      5
1366 #define LEFT_SHIFT_NO_FLOW      10
1367 #define LEFT_SHIFT_FLOW         15
1368 #define RIGHT_SHIFT_NO_FLOW     20
1369 #define RIGHT_SHIFT_FLOW        25
1370 #define LR_SHIFT_NO_FLOW        30
1371 #define LR_SHIFT_FLOW           35
1372
1373                 lpar = tb->lnum[h];
1374                 rpar = tb->rnum[h];
1375
1376                 /* calculate number of blocks S[h] must be split into when
1377                    nothing is shifted to the neighbors,
1378                    as well as number of items in each part of the split node (s012 numbers),
1379                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1380                 nset = NOTHING_SHIFT_NO_FLOW;
1381                 nver = get_num_ver(vn->vn_mode, tb, h,
1382                                    0, -1, h ? vn->vn_nr_item : 0, -1,
1383                                    snum012, NO_FLOW);
1384
1385                 if (!h) {
1386                         int nver1;
1387
1388                         /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1389                         nver1 = get_num_ver(vn->vn_mode, tb, h,
1390                                             0, -1, 0, -1,
1391                                             snum012 + NOTHING_SHIFT_FLOW, FLOW);
1392                         if (nver > nver1)
1393                                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1394                 }
1395
1396                 /* calculate number of blocks S[h] must be split into when
1397                    l_shift_num first items and l_shift_bytes of the right most
1398                    liquid item to be shifted are shifted to the left neighbor,
1399                    as well as number of items in each part of the splitted node (s012 numbers),
1400                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1401                  */
1402                 lset = LEFT_SHIFT_NO_FLOW;
1403                 lnver = get_num_ver(vn->vn_mode, tb, h,
1404                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1405                                     -1, h ? vn->vn_nr_item : 0, -1,
1406                                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1407                 if (!h) {
1408                         int lnver1;
1409
1410                         lnver1 = get_num_ver(vn->vn_mode, tb, h,
1411                                              lpar -
1412                                              ((tb->lbytes != -1) ? 1 : 0),
1413                                              tb->lbytes, 0, -1,
1414                                              snum012 + LEFT_SHIFT_FLOW, FLOW);
1415                         if (lnver > lnver1)
1416                                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1417                 }
1418
1419                 /* calculate number of blocks S[h] must be split into when
1420                    r_shift_num first items and r_shift_bytes of the left most
1421                    liquid item to be shifted are shifted to the right neighbor,
1422                    as well as number of items in each part of the splitted node (s012 numbers),
1423                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1424                  */
1425                 rset = RIGHT_SHIFT_NO_FLOW;
1426                 rnver = get_num_ver(vn->vn_mode, tb, h,
1427                                     0, -1,
1428                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1429                                                                    ((tb->
1430                                                                      rbytes !=
1431                                                                      -1) ? 1 :
1432                                                                     0)), -1,
1433                                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1434                 if (!h) {
1435                         int rnver1;
1436
1437                         rnver1 = get_num_ver(vn->vn_mode, tb, h,
1438                                              0, -1,
1439                                              (rpar -
1440                                               ((tb->rbytes != -1) ? 1 : 0)),
1441                                              tb->rbytes,
1442                                              snum012 + RIGHT_SHIFT_FLOW, FLOW);
1443
1444                         if (rnver > rnver1)
1445                                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1446                 }
1447
1448                 /* calculate number of blocks S[h] must be split into when
1449                    items are shifted in both directions,
1450                    as well as number of items in each part of the splitted node (s012 numbers),
1451                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1452                  */
1453                 lrset = LR_SHIFT_NO_FLOW;
1454                 lrnver = get_num_ver(vn->vn_mode, tb, h,
1455                                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1456                                      -1,
1457                                      h ? (vn->vn_nr_item - rpar) : (rpar -
1458                                                                     ((tb->
1459                                                                       rbytes !=
1460                                                                       -1) ? 1 :
1461                                                                      0)), -1,
1462                                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1463                 if (!h) {
1464                         int lrnver1;
1465
1466                         lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1467                                               lpar -
1468                                               ((tb->lbytes != -1) ? 1 : 0),
1469                                               tb->lbytes,
1470                                               (rpar -
1471                                                ((tb->rbytes != -1) ? 1 : 0)),
1472                                               tb->rbytes,
1473                                               snum012 + LR_SHIFT_FLOW, FLOW);
1474                         if (lrnver > lrnver1)
1475                                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1476                 }
1477
1478                 /* Our general shifting strategy is:
1479                    1) to minimized number of new nodes;
1480                    2) to minimized number of neighbors involved in shifting;
1481                    3) to minimized number of disk reads; */
1482
1483                 /* we can win TWO or ONE nodes by shifting in both directions */
1484                 if (lrnver < lnver && lrnver < rnver) {
1485                         RFALSE(h &&
1486                                (tb->lnum[h] != 1 ||
1487                                 tb->rnum[h] != 1 ||
1488                                 lrnver != 1 || rnver != 2 || lnver != 2
1489                                 || h != 1), "vs-8230: bad h");
1490                         if (lrset == LR_SHIFT_FLOW)
1491                                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1492                                                lrnver, snum012 + lrset,
1493                                                tb->lbytes, tb->rbytes);
1494                         else
1495                                 set_parameters(tb, h,
1496                                                tb->lnum[h] -
1497                                                ((tb->lbytes == -1) ? 0 : 1),
1498                                                tb->rnum[h] -
1499                                                ((tb->rbytes == -1) ? 0 : 1),
1500                                                lrnver, snum012 + lrset, -1, -1);
1501
1502                         return CARRY_ON;
1503                 }
1504
1505                 /* if shifting doesn't lead to better packing then don't shift */
1506                 if (nver == lrnver) {
1507                         set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1508                                        -1);
1509                         return CARRY_ON;
1510                 }
1511
1512                 /* now we know that for better packing shifting in only one
1513                    direction either to the left or to the right is required */
1514
1515                 /*  if shifting to the left is better than shifting to the right */
1516                 if (lnver < rnver) {
1517                         SET_PAR_SHIFT_LEFT;
1518                         return CARRY_ON;
1519                 }
1520
1521                 /* if shifting to the right is better than shifting to the left */
1522                 if (lnver > rnver) {
1523                         SET_PAR_SHIFT_RIGHT;
1524                         return CARRY_ON;
1525                 }
1526
1527                 /* now shifting in either direction gives the same number
1528                    of nodes and we can make use of the cached neighbors */
1529                 if (is_left_neighbor_in_cache(tb, h)) {
1530                         SET_PAR_SHIFT_LEFT;
1531                         return CARRY_ON;
1532                 }
1533
1534                 /* shift to the right independently on whether the right neighbor in cache or not */
1535                 SET_PAR_SHIFT_RIGHT;
1536                 return CARRY_ON;
1537         }
1538 }
1539
1540 /* Check whether current node S[h] is balanced when Decreasing its size by
1541  * Deleting or Cutting for INTERNAL node of S+tree.
1542  * Calculate parameters for balancing for current level h.
1543  * Parameters:
1544  *      tb      tree_balance structure;
1545  *      h       current level of the node;
1546  *      inum    item number in S[h];
1547  *      mode    i - insert, p - paste;
1548  * Returns:     1 - schedule occurred; 
1549  *              0 - balancing for higher levels needed;
1550  *             -1 - no balancing for higher levels needed;
1551  *             -2 - no disk space.
1552  *
1553  * Note: Items of internal nodes have fixed size, so the balance condition for
1554  * the internal part of S+tree is as for the B-trees.
1555  */
1556 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1557 {
1558         struct virtual_node *vn = tb->tb_vn;
1559
1560         /* Sh is the node whose balance is currently being checked,
1561            and Fh is its father.  */
1562         struct buffer_head *Sh, *Fh;
1563         int maxsize, n_ret_value;
1564         int lfree, rfree /* free space in L and R */ ;
1565
1566         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1567         Fh = PATH_H_PPARENT(tb->tb_path, h);
1568
1569         maxsize = MAX_CHILD_SIZE(Sh);
1570
1571 /*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1572 /*   new_nr_item = number of items node would have if operation is */
1573 /*      performed without balancing (new_nr_item); */
1574         create_virtual_node(tb, h);
1575
1576         if (!Fh) {              /* S[h] is the root. */
1577                 if (vn->vn_nr_item > 0) {
1578                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1579                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1580                 }
1581                 /* new_nr_item == 0.
1582                  * Current root will be deleted resulting in
1583                  * decrementing the tree height. */
1584                 set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1585                 return CARRY_ON;
1586         }
1587
1588         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
1589                 return n_ret_value;
1590
1591         /* get free space of neighbors */
1592         rfree = get_rfree(tb, h);
1593         lfree = get_lfree(tb, h);
1594
1595         /* determine maximal number of items we can fit into neighbors */
1596         check_left(tb, h, lfree);
1597         check_right(tb, h, rfree);
1598
1599         if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
1600                                                  * In this case we balance only if it leads to better packing. */
1601                 if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
1602                                                          * which is impossible with greater values of new_nr_item. */
1603                         if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1604                                 /* All contents of S[h] can be moved to L[h]. */
1605                                 int n;
1606                                 int order_L;
1607
1608                                 order_L =
1609                                     ((n =
1610                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1611                                                           h)) ==
1612                                      0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1613                                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1614                                     (DC_SIZE + KEY_SIZE);
1615                                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1616                                                -1);
1617                                 return CARRY_ON;
1618                         }
1619
1620                         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1621                                 /* All contents of S[h] can be moved to R[h]. */
1622                                 int n;
1623                                 int order_R;
1624
1625                                 order_R =
1626                                     ((n =
1627                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1628                                                           h)) ==
1629                                      B_NR_ITEMS(Fh)) ? 0 : n + 1;
1630                                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1631                                     (DC_SIZE + KEY_SIZE);
1632                                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1633                                                -1);
1634                                 return CARRY_ON;
1635                         }
1636                 }
1637
1638                 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1639                         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1640                         int to_r;
1641
1642                         to_r =
1643                             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1644                              tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1645                             (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1646                         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1647                                        0, NULL, -1, -1);
1648                         return CARRY_ON;
1649                 }
1650
1651                 /* Balancing does not lead to better packing. */
1652                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1653                 return NO_BALANCING_NEEDED;
1654         }
1655
1656         /* Current node contain insufficient number of items. Balancing is required. */
1657         /* Check whether we can merge S[h] with left neighbor. */
1658         if (tb->lnum[h] >= vn->vn_nr_item + 1)
1659                 if (is_left_neighbor_in_cache(tb, h)
1660                     || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1661                         int n;
1662                         int order_L;
1663
1664                         order_L =
1665                             ((n =
1666                               PATH_H_B_ITEM_ORDER(tb->tb_path,
1667                                                   h)) ==
1668                              0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1669                         n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1670                                                                       KEY_SIZE);
1671                         set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1672                         return CARRY_ON;
1673                 }
1674
1675         /* Check whether we can merge S[h] with right neighbor. */
1676         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1677                 int n;
1678                 int order_R;
1679
1680                 order_R =
1681                     ((n =
1682                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1683                                           h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1684                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1685                                                               KEY_SIZE);
1686                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1687                 return CARRY_ON;
1688         }
1689
1690         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1691         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1692                 int to_r;
1693
1694                 to_r =
1695                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1696                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1697                                                 tb->rnum[h]);
1698                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1699                                -1, -1);
1700                 return CARRY_ON;
1701         }
1702
1703         /* For internal nodes try to borrow item from a neighbor */
1704         RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1705
1706         /* Borrow one or two items from caching neighbor */
1707         if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1708                 int from_l;
1709
1710                 from_l =
1711                     (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1712                      1) / 2 - (vn->vn_nr_item + 1);
1713                 set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1714                 return CARRY_ON;
1715         }
1716
1717         set_parameters(tb, h, 0,
1718                        -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1719                           1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1720         return CARRY_ON;
1721 }
1722
1723 /* Check whether current node S[h] is balanced when Decreasing its size by
1724  * Deleting or Truncating for LEAF node of S+tree.
1725  * Calculate parameters for balancing for current level h.
1726  * Parameters:
1727  *      tb      tree_balance structure;
1728  *      h       current level of the node;
1729  *      inum    item number in S[h];
1730  *      mode    i - insert, p - paste;
1731  * Returns:     1 - schedule occurred; 
1732  *              0 - balancing for higher levels needed;
1733  *             -1 - no balancing for higher levels needed;
1734  *             -2 - no disk space.
1735  */
1736 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1737 {
1738         struct virtual_node *vn = tb->tb_vn;
1739
1740         /* Number of bytes that must be deleted from
1741            (value is negative if bytes are deleted) buffer which
1742            contains node being balanced.  The mnemonic is that the
1743            attempted change in node space used level is levbytes bytes. */
1744         int levbytes;
1745         /* the maximal item size */
1746         int maxsize, n_ret_value;
1747         /* S0 is the node whose balance is currently being checked,
1748            and F0 is its father.  */
1749         struct buffer_head *S0, *F0;
1750         int lfree, rfree /* free space in L and R */ ;
1751
1752         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1753         F0 = PATH_H_PPARENT(tb->tb_path, 0);
1754
1755         levbytes = tb->insert_size[h];
1756
1757         maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1758
1759         if (!F0) {              /* S[0] is the root now. */
1760
1761                 RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1762                        "vs-8240: attempt to create empty buffer tree");
1763
1764                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1765                 return NO_BALANCING_NEEDED;
1766         }
1767
1768         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
1769                 return n_ret_value;
1770
1771         /* get free space of neighbors */
1772         rfree = get_rfree(tb, h);
1773         lfree = get_lfree(tb, h);
1774
1775         create_virtual_node(tb, h);
1776
1777         /* if 3 leaves can be merge to one, set parameters and return */
1778         if (are_leaves_removable(tb, lfree, rfree))
1779                 return CARRY_ON;
1780
1781         /* determine maximal number of items we can shift to the left/right  neighbor
1782            and the maximal number of bytes that can flow to the left/right neighbor
1783            from the left/right most liquid item that cannot be shifted from S[0] entirely
1784          */
1785         check_left(tb, h, lfree);
1786         check_right(tb, h, rfree);
1787
1788         /* check whether we can merge S with left neighbor. */
1789         if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1790                 if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1791                     !tb->FR[h]) {
1792
1793                         RFALSE(!tb->FL[h],
1794                                "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1795
1796                         /* set parameter to merge S[0] with its left neighbor */
1797                         set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1798                         return CARRY_ON;
1799                 }
1800
1801         /* check whether we can merge S[0] with right neighbor. */
1802         if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1803                 set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
1804                 return CARRY_ON;
1805         }
1806
1807         /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1808         if (is_leaf_removable(tb))
1809                 return CARRY_ON;
1810
1811         /* Balancing is not required. */
1812         tb->s0num = vn->vn_nr_item;
1813         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1814         return NO_BALANCING_NEEDED;
1815 }
1816
1817 /* Check whether current node S[h] is balanced when Decreasing its size by
1818  * Deleting or Cutting.
1819  * Calculate parameters for balancing for current level h.
1820  * Parameters:
1821  *      tb      tree_balance structure;
1822  *      h       current level of the node;
1823  *      inum    item number in S[h];
1824  *      mode    d - delete, c - cut.
1825  * Returns:     1 - schedule occurred; 
1826  *              0 - balancing for higher levels needed;
1827  *             -1 - no balancing for higher levels needed;
1828  *             -2 - no disk space.
1829  */
1830 static int dc_check_balance(struct tree_balance *tb, int h)
1831 {
1832         RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
1833                "vs-8250: S is not initialized");
1834
1835         if (h)
1836                 return dc_check_balance_internal(tb, h);
1837         else
1838                 return dc_check_balance_leaf(tb, h);
1839 }
1840
1841 /* Check whether current node S[h] is balanced.
1842  * Calculate parameters for balancing for current level h.
1843  * Parameters:
1844  *
1845  *      tb      tree_balance structure:
1846  *
1847  *              tb is a large structure that must be read about in the header file
1848  *              at the same time as this procedure if the reader is to successfully
1849  *              understand this procedure
1850  *
1851  *      h       current level of the node;
1852  *      inum    item number in S[h];
1853  *      mode    i - insert, p - paste, d - delete, c - cut.
1854  * Returns:     1 - schedule occurred; 
1855  *              0 - balancing for higher levels needed;
1856  *             -1 - no balancing for higher levels needed;
1857  *             -2 - no disk space.
1858  */
1859 static int check_balance(int mode,
1860                          struct tree_balance *tb,
1861                          int h,
1862                          int inum,
1863                          int pos_in_item,
1864                          struct item_head *ins_ih, const void *data)
1865 {
1866         struct virtual_node *vn;
1867
1868         vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1869         vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1870         vn->vn_mode = mode;
1871         vn->vn_affected_item_num = inum;
1872         vn->vn_pos_in_item = pos_in_item;
1873         vn->vn_ins_ih = ins_ih;
1874         vn->vn_data = data;
1875
1876         RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
1877                "vs-8255: ins_ih can not be 0 in insert mode");
1878
1879         if (tb->insert_size[h] > 0)
1880                 /* Calculate balance parameters when size of node is increasing. */
1881                 return ip_check_balance(tb, h);
1882
1883         /* Calculate balance parameters when  size of node is decreasing. */
1884         return dc_check_balance(tb, h);
1885 }
1886
1887 /* Check whether parent at the path is the really parent of the current node.*/
1888 static int get_direct_parent(struct tree_balance *p_s_tb, int n_h)
1889 {
1890         struct buffer_head *p_s_bh;
1891         struct path *p_s_path = p_s_tb->tb_path;
1892         int n_position,
1893             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1894
1895         /* We are in the root or in the new root. */
1896         if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1897
1898                 RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1899                        "PAP-8260: invalid offset in the path");
1900
1901                 if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->
1902                     b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
1903                         /* Root is not changed. */
1904                         PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1905                         PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1906                         return CARRY_ON;
1907                 }
1908                 return REPEAT_SEARCH;   /* Root is changed and we must recalculate the path. */
1909         }
1910
1911         if (!B_IS_IN_TREE
1912             (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)))
1913                 return REPEAT_SEARCH;   /* Parent in the path is not in the tree. */
1914
1915         if ((n_position =
1916              PATH_OFFSET_POSITION(p_s_path,
1917                                   n_path_offset - 1)) > B_NR_ITEMS(p_s_bh))
1918                 return REPEAT_SEARCH;
1919
1920         if (B_N_CHILD_NUM(p_s_bh, n_position) !=
1921             PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr)
1922                 /* Parent in the path is not parent of the current node in the tree. */
1923                 return REPEAT_SEARCH;
1924
1925         if (buffer_locked(p_s_bh)) {
1926                 __wait_on_buffer(p_s_bh);
1927                 if (FILESYSTEM_CHANGED_TB(p_s_tb))
1928                         return REPEAT_SEARCH;
1929         }
1930
1931         return CARRY_ON;        /* Parent in the path is unlocked and really parent of the current node.  */
1932 }
1933
1934 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1935  * of S[n_h] we
1936  * need in order to balance S[n_h], and get them if necessary.
1937  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1938  *              CARRY_ON - schedule didn't occur while the function worked;
1939  */
1940 static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
1941 {
1942         int n_child_position,
1943             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1944         unsigned long n_son_number;
1945         struct super_block *p_s_sb = p_s_tb->tb_sb;
1946         struct buffer_head *p_s_bh;
1947
1948         PROC_INFO_INC(p_s_sb, get_neighbors[n_h]);
1949
1950         if (p_s_tb->lnum[n_h]) {
1951                 /* We need left neighbor to balance S[n_h]. */
1952                 PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]);
1953                 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1954
1955                 RFALSE(p_s_bh == p_s_tb->FL[n_h] &&
1956                        !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1957                        "PAP-8270: invalid position in the parent");
1958
1959                 n_child_position =
1960                     (p_s_bh ==
1961                      p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->
1962                                                                        FL[n_h]);
1963                 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1964                 p_s_bh = sb_bread(p_s_sb, n_son_number);
1965                 if (!p_s_bh)
1966                         return IO_ERROR;
1967                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1968                         decrement_bcount(p_s_bh);
1969                         PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
1970                         return REPEAT_SEARCH;
1971                 }
1972
1973                 RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1974                        n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1975                        B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1976                        p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1977                 RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1978                 RFALSE(!n_h &&
1979                        B_FREE_SPACE(p_s_bh) !=
1980                        MAX_CHILD_SIZE(p_s_bh) -
1981                        dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)),
1982                        "PAP-8290: invalid child size of left neighbor");
1983
1984                 decrement_bcount(p_s_tb->L[n_h]);
1985                 p_s_tb->L[n_h] = p_s_bh;
1986         }
1987
1988         if (p_s_tb->rnum[n_h]) {        /* We need right neighbor to balance S[n_path_offset]. */
1989                 PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]);
1990                 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1991
1992                 RFALSE(p_s_bh == p_s_tb->FR[n_h] &&
1993                        PATH_OFFSET_POSITION(p_s_tb->tb_path,
1994                                             n_path_offset) >=
1995                        B_NR_ITEMS(p_s_bh),
1996                        "PAP-8295: invalid position in the parent");
1997
1998                 n_child_position =
1999                     (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0;
2000                 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
2001                 p_s_bh = sb_bread(p_s_sb, n_son_number);
2002                 if (!p_s_bh)
2003                         return IO_ERROR;
2004                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2005                         decrement_bcount(p_s_bh);
2006                         PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
2007                         return REPEAT_SEARCH;
2008                 }
2009                 decrement_bcount(p_s_tb->R[n_h]);
2010                 p_s_tb->R[n_h] = p_s_bh;
2011
2012                 RFALSE(!n_h
2013                        && B_FREE_SPACE(p_s_bh) !=
2014                        MAX_CHILD_SIZE(p_s_bh) -
2015                        dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)),
2016                        "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2017                        B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh),
2018                        dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)));
2019
2020         }
2021         return CARRY_ON;
2022 }
2023
2024 #ifdef CONFIG_REISERFS_CHECK
2025 void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s)
2026 {
2027         void *vp;
2028         static size_t malloced;
2029
2030         vp = kmalloc(size, flags);
2031         if (vp) {
2032                 REISERFS_SB(s)->s_kmallocs += size;
2033                 if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
2034                         reiserfs_warning(s,
2035                                          "vs-8301: reiserfs_kmalloc: allocated memory %d",
2036                                          REISERFS_SB(s)->s_kmallocs);
2037                         malloced = REISERFS_SB(s)->s_kmallocs;
2038                 }
2039         }
2040         return vp;
2041 }
2042
2043 void reiserfs_kfree(const void *vp, size_t size, struct super_block *s)
2044 {
2045         kfree(vp);
2046
2047         REISERFS_SB(s)->s_kmallocs -= size;
2048         if (REISERFS_SB(s)->s_kmallocs < 0)
2049                 reiserfs_warning(s,
2050                                  "vs-8302: reiserfs_kfree: allocated memory %d",
2051                                  REISERFS_SB(s)->s_kmallocs);
2052
2053 }
2054 #endif
2055
2056 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2057 {
2058         int max_num_of_items;
2059         int max_num_of_entries;
2060         unsigned long blocksize = sb->s_blocksize;
2061
2062 #define MIN_NAME_LEN 1
2063
2064         max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2065         max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2066             (DEH_SIZE + MIN_NAME_LEN);
2067
2068         return sizeof(struct virtual_node) +
2069             max(max_num_of_items * sizeof(struct virtual_item),
2070                 sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2071                 (max_num_of_entries - 1) * sizeof(__u16));
2072 }
2073
2074 /* maybe we should fail balancing we are going to perform when kmalloc
2075    fails several times. But now it will loop until kmalloc gets
2076    required memory */
2077 static int get_mem_for_virtual_node(struct tree_balance *tb)
2078 {
2079         int check_fs = 0;
2080         int size;
2081         char *buf;
2082
2083         size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2084
2085         if (size > tb->vn_buf_size) {
2086                 /* we have to allocate more memory for virtual node */
2087                 if (tb->vn_buf) {
2088                         /* free memory allocated before */
2089                         reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2090                         /* this is not needed if kfree is atomic */
2091                         check_fs = 1;
2092                 }
2093
2094                 /* virtual node requires now more memory */
2095                 tb->vn_buf_size = size;
2096
2097                 /* get memory for virtual item */
2098                 buf =
2099                     reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN,
2100                                      tb->tb_sb);
2101                 if (!buf) {
2102                         /* getting memory with GFP_KERNEL priority may involve
2103                            balancing now (due to indirect_to_direct conversion on
2104                            dcache shrinking). So, release path and collected
2105                            resources here */
2106                         free_buffers_in_tb(tb);
2107                         buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
2108                         if (!buf) {
2109 #ifdef CONFIG_REISERFS_CHECK
2110                                 reiserfs_warning(tb->tb_sb,
2111                                                  "vs-8345: get_mem_for_virtual_node: "
2112                                                  "kmalloc failed. reiserfs kmalloced %d bytes",
2113                                                  REISERFS_SB(tb->tb_sb)->
2114                                                  s_kmallocs);
2115 #endif
2116                                 tb->vn_buf_size = 0;
2117                         }
2118                         tb->vn_buf = buf;
2119                         schedule();
2120                         return REPEAT_SEARCH;
2121                 }
2122
2123                 tb->vn_buf = buf;
2124         }
2125
2126         if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2127                 return REPEAT_SEARCH;
2128
2129         return CARRY_ON;
2130 }
2131
2132 #ifdef CONFIG_REISERFS_CHECK
2133 static void tb_buffer_sanity_check(struct super_block *p_s_sb,
2134                                    struct buffer_head *p_s_bh,
2135                                    const char *descr, int level)
2136 {
2137         if (p_s_bh) {
2138                 if (atomic_read(&(p_s_bh->b_count)) <= 0) {
2139
2140                         reiserfs_panic(p_s_sb,
2141                                        "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
2142                                        descr, level, p_s_bh);
2143                 }
2144
2145                 if (!buffer_uptodate(p_s_bh)) {
2146                         reiserfs_panic(p_s_sb,
2147                                        "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
2148                                        descr, level, p_s_bh);
2149                 }
2150
2151                 if (!B_IS_IN_TREE(p_s_bh)) {
2152                         reiserfs_panic(p_s_sb,
2153                                        "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
2154                                        descr, level, p_s_bh);
2155                 }
2156
2157                 if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
2158                         reiserfs_panic(p_s_sb,
2159                                        "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
2160                                        descr, level, p_s_bh);
2161                 }
2162
2163                 if (p_s_bh->b_size != p_s_sb->s_blocksize) {
2164                         reiserfs_panic(p_s_sb,
2165                                        "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
2166                                        descr, level, p_s_bh);
2167                 }
2168
2169                 if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2170                         reiserfs_panic(p_s_sb,
2171                                        "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
2172                                        descr, level, p_s_bh);
2173                 }
2174         }
2175 }
2176 #else
2177 static void tb_buffer_sanity_check(struct super_block *p_s_sb,
2178                                    struct buffer_head *p_s_bh,
2179                                    const char *descr, int level)
2180 {;
2181 }
2182 #endif
2183
2184 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2185 {
2186         return reiserfs_prepare_for_journal(s, bh, 0);
2187 }
2188
2189 static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
2190 {
2191         struct buffer_head *locked;
2192 #ifdef CONFIG_REISERFS_CHECK
2193         int repeat_counter = 0;
2194 #endif
2195         int i;
2196
2197         do {
2198
2199                 locked = NULL;
2200
2201                 for (i = p_s_tb->tb_path->path_length;
2202                      !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2203                         if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2204                                 /* if I understand correctly, we can only be sure the last buffer
2205                                  ** in the path is in the tree --clm
2206                                  */
2207 #ifdef CONFIG_REISERFS_CHECK
2208                                 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2209                                     PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2210                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2211                                                                PATH_OFFSET_PBUFFER
2212                                                                (p_s_tb->tb_path,
2213                                                                 i), "S",
2214                                                                p_s_tb->tb_path->
2215                                                                path_length - i);
2216                                 }
2217 #endif
2218                                 if (!clear_all_dirty_bits(p_s_tb->tb_sb,
2219                                                           PATH_OFFSET_PBUFFER
2220                                                           (p_s_tb->tb_path,
2221                                                            i))) {
2222                                         locked =
2223                                             PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
2224                                                                 i);
2225                                 }
2226                         }
2227                 }
2228
2229                 for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
2230                      i++) {
2231
2232                         if (p_s_tb->lnum[i]) {
2233
2234                                 if (p_s_tb->L[i]) {
2235                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2236                                                                p_s_tb->L[i],
2237                                                                "L", i);
2238                                         if (!clear_all_dirty_bits
2239                                             (p_s_tb->tb_sb, p_s_tb->L[i]))
2240                                                 locked = p_s_tb->L[i];
2241                                 }
2242
2243                                 if (!locked && p_s_tb->FL[i]) {
2244                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2245                                                                p_s_tb->FL[i],
2246                                                                "FL", i);
2247                                         if (!clear_all_dirty_bits
2248                                             (p_s_tb->tb_sb, p_s_tb->FL[i]))
2249                                                 locked = p_s_tb->FL[i];
2250                                 }
2251
2252                                 if (!locked && p_s_tb->CFL[i]) {
2253                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2254                                                                p_s_tb->CFL[i],
2255                                                                "CFL", i);
2256                                         if (!clear_all_dirty_bits
2257                                             (p_s_tb->tb_sb, p_s_tb->CFL[i]))
2258                                                 locked = p_s_tb->CFL[i];
2259                                 }
2260
2261                         }
2262
2263                         if (!locked && (p_s_tb->rnum[i])) {
2264
2265                                 if (p_s_tb->R[i]) {
2266                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2267                                                                p_s_tb->R[i],
2268                                                                "R", i);
2269                                         if (!clear_all_dirty_bits
2270                                             (p_s_tb->tb_sb, p_s_tb->R[i]))
2271                                                 locked = p_s_tb->R[i];
2272                                 }
2273
2274                                 if (!locked && p_s_tb->FR[i]) {
2275                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2276                                                                p_s_tb->FR[i],
2277                                                                "FR", i);
2278                                         if (!clear_all_dirty_bits
2279                                             (p_s_tb->tb_sb, p_s_tb->FR[i]))
2280                                                 locked = p_s_tb->FR[i];
2281                                 }
2282
2283                                 if (!locked && p_s_tb->CFR[i]) {
2284                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2285                                                                p_s_tb->CFR[i],
2286                                                                "CFR", i);
2287                                         if (!clear_all_dirty_bits
2288                                             (p_s_tb->tb_sb, p_s_tb->CFR[i]))
2289                                                 locked = p_s_tb->CFR[i];
2290                                 }
2291                         }
2292                 }
2293                 /* as far as I can tell, this is not required.  The FEB list seems
2294                  ** to be full of newly allocated nodes, which will never be locked,
2295                  ** dirty, or anything else.
2296                  ** To be safe, I'm putting in the checks and waits in.  For the moment,
2297                  ** they are needed to keep the code in journal.c from complaining
2298                  ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
2299                  ** --clm
2300                  */
2301                 for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2302                         if (p_s_tb->FEB[i]) {
2303                                 if (!clear_all_dirty_bits
2304                                     (p_s_tb->tb_sb, p_s_tb->FEB[i]))
2305                                         locked = p_s_tb->FEB[i];
2306                         }
2307                 }
2308
2309                 if (locked) {
2310 #ifdef CONFIG_REISERFS_CHECK
2311                         repeat_counter++;
2312                         if ((repeat_counter % 10000) == 0) {
2313                                 reiserfs_warning(p_s_tb->tb_sb,
2314                                                  "wait_tb_buffers_until_released(): too many "
2315                                                  "iterations waiting for buffer to unlock "
2316                                                  "(%b)", locked);
2317
2318                                 /* Don't loop forever.  Try to recover from possible error. */
2319
2320                                 return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
2321                                     REPEAT_SEARCH : CARRY_ON;
2322                         }
2323 #endif
2324                         __wait_on_buffer(locked);
2325                         if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2326                                 return REPEAT_SEARCH;
2327                         }
2328                 }
2329
2330         } while (locked);
2331
2332         return CARRY_ON;
2333 }
2334
2335 /* Prepare for balancing, that is
2336  *      get all necessary parents, and neighbors;
2337  *      analyze what and where should be moved;
2338  *      get sufficient number of new nodes;
2339  * Balancing will start only after all resources will be collected at a time.
2340  * 
2341  * When ported to SMP kernels, only at the last moment after all needed nodes
2342  * are collected in cache, will the resources be locked using the usual
2343  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2344  * this code neither write locks what it does not need to write lock nor locks out of order
2345  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
2346  * 
2347  * fix is meant in the sense of render unchanging
2348  * 
2349  * Latency might be improved by first gathering a list of what buffers are needed
2350  * and then getting as many of them in parallel as possible? -Hans
2351  *
2352  * Parameters:
2353  *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2354  *      tb      tree_balance structure;
2355  *      inum    item number in S[h];
2356  *      pos_in_item - comment this if you can
2357  *      ins_ih & ins_sd are used when inserting
2358  * Returns:     1 - schedule occurred while the function worked;
2359  *              0 - schedule didn't occur while the function worked;
2360  *             -1 - if no_disk_space 
2361  */
2362
2363 int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih, // item head of item being inserted
2364               const void *data  // inserted item or data to be pasted
2365     )
2366 {
2367         int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2368         int n_pos_in_item;
2369
2370         /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2371          ** during wait_tb_buffers_run
2372          */
2373         int wait_tb_buffers_run = 0;
2374         struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
2375
2376         ++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;
2377
2378         n_pos_in_item = p_s_tb->tb_path->pos_in_item;
2379
2380         p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);
2381
2382         /* we prepare and log the super here so it will already be in the
2383          ** transaction when do_balance needs to change it.
2384          ** This way do_balance won't have to schedule when trying to prepare
2385          ** the super for logging
2386          */
2387         reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2388                                      SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
2389         journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2390                            SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
2391         if (FILESYSTEM_CHANGED_TB(p_s_tb))
2392                 return REPEAT_SEARCH;
2393
2394         /* if it possible in indirect_to_direct conversion */
2395         if (buffer_locked(p_s_tbS0)) {
2396                 __wait_on_buffer(p_s_tbS0);
2397                 if (FILESYSTEM_CHANGED_TB(p_s_tb))
2398                         return REPEAT_SEARCH;
2399         }
2400 #ifdef CONFIG_REISERFS_CHECK
2401         if (cur_tb) {
2402                 print_cur_tb("fix_nodes");
2403                 reiserfs_panic(p_s_tb->tb_sb,
2404                                "PAP-8305: fix_nodes:  there is pending do_balance");
2405         }
2406
2407         if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
2408                 reiserfs_panic(p_s_tb->tb_sb,
2409                                "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2410                                "at the beginning of fix_nodes or not in tree (mode %c)",
2411                                p_s_tbS0, p_s_tbS0, n_op_mode);
2412         }
2413
2414         /* Check parameters. */
2415         switch (n_op_mode) {
2416         case M_INSERT:
2417                 if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
2418                         reiserfs_panic(p_s_tb->tb_sb,
2419                                        "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2420                                        n_item_num, B_NR_ITEMS(p_s_tbS0));
2421                 break;
2422         case M_PASTE:
2423         case M_DELETE:
2424         case M_CUT:
2425                 if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
2426                         print_block(p_s_tbS0, 0, -1, -1);
2427                         reiserfs_panic(p_s_tb->tb_sb,
2428                                        "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
2429                                        n_item_num, n_op_mode,
2430                                        p_s_tb->insert_size[0]);
2431                 }
2432                 break;
2433         default:
2434                 reiserfs_panic(p_s_tb->tb_sb,
2435                                "PAP-8340: fix_nodes: Incorrect mode of operation");
2436         }
2437 #endif
2438
2439         if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
2440                 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2441                 return REPEAT_SEARCH;
2442
2443         /* Starting from the leaf level; for all levels n_h of the tree. */
2444         for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
2445                 if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
2446                         goto repeat;
2447                 }
2448
2449                 if ((n_ret_value =
2450                      check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
2451                                    n_pos_in_item, p_s_ins_ih,
2452                                    data)) != CARRY_ON) {
2453                         if (n_ret_value == NO_BALANCING_NEEDED) {
2454                                 /* No balancing for higher levels needed. */
2455                                 if ((n_ret_value =
2456                                      get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
2457                                         goto repeat;
2458                                 }
2459                                 if (n_h != MAX_HEIGHT - 1)
2460                                         p_s_tb->insert_size[n_h + 1] = 0;
2461                                 /* ok, analysis and resource gathering are complete */
2462                                 break;
2463                         }
2464                         goto repeat;
2465                 }
2466
2467                 if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
2468                         goto repeat;
2469                 }
2470
2471                 if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
2472                         goto repeat;    /* No disk space, or schedule occurred and
2473                                            analysis may be invalid and needs to be redone. */
2474                 }
2475
2476                 if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
2477                         /* We have a positive insert size but no nodes exist on this
2478                            level, this means that we are creating a new root. */
2479
2480                         RFALSE(p_s_tb->blknum[n_h] != 1,
2481                                "PAP-8350: creating new empty root");
2482
2483                         if (n_h < MAX_HEIGHT - 1)
2484                                 p_s_tb->insert_size[n_h + 1] = 0;
2485                 } else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
2486                         if (p_s_tb->blknum[n_h] > 1) {
2487                                 /* The tree needs to be grown, so this node S[n_h]
2488                                    which is the root node is split into two nodes,
2489                                    and a new node (S[n_h+1]) will be created to
2490                                    become the root node.  */
2491
2492                                 RFALSE(n_h == MAX_HEIGHT - 1,
2493                                        "PAP-8355: attempt to create too high of a tree");
2494
2495                                 p_s_tb->insert_size[n_h + 1] =
2496                                     (DC_SIZE +
2497                                      KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
2498                                     DC_SIZE;
2499                         } else if (n_h < MAX_HEIGHT - 1)
2500                                 p_s_tb->insert_size[n_h + 1] = 0;
2501                 } else
2502                         p_s_tb->insert_size[n_h + 1] =
2503                             (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
2504         }
2505
2506         if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
2507                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2508                         wait_tb_buffers_run = 1;
2509                         n_ret_value = REPEAT_SEARCH;
2510                         goto repeat;
2511                 } else {
2512                         return CARRY_ON;
2513                 }
2514         } else {
2515                 wait_tb_buffers_run = 1;
2516                 goto repeat;
2517         }
2518
2519       repeat:
2520         // fix_nodes was unable to perform its calculation due to
2521         // filesystem got changed under us, lack of free disk space or i/o
2522         // failure. If the first is the case - the search will be
2523         // repeated. For now - free all resources acquired so far except
2524         // for the new allocated nodes
2525         {
2526                 int i;
2527
2528                 /* Release path buffers. */
2529                 if (wait_tb_buffers_run) {
2530                         pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
2531                 } else {
2532                         pathrelse(p_s_tb->tb_path);
2533                 }
2534                 /* brelse all resources collected for balancing */
2535                 for (i = 0; i < MAX_HEIGHT; i++) {
2536                         if (wait_tb_buffers_run) {
2537                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2538                                                                  p_s_tb->L[i]);
2539                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2540                                                                  p_s_tb->R[i]);
2541                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2542                                                                  p_s_tb->FL[i]);
2543                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2544                                                                  p_s_tb->FR[i]);
2545                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2546                                                                  p_s_tb->
2547                                                                  CFL[i]);
2548                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2549                                                                  p_s_tb->
2550                                                                  CFR[i]);
2551                         }
2552
2553                         brelse(p_s_tb->L[i]);
2554                         p_s_tb->L[i] = NULL;
2555                         brelse(p_s_tb->R[i]);
2556                         p_s_tb->R[i] = NULL;
2557                         brelse(p_s_tb->FL[i]);
2558                         p_s_tb->FL[i] = NULL;
2559                         brelse(p_s_tb->FR[i]);
2560                         p_s_tb->FR[i] = NULL;
2561                         brelse(p_s_tb->CFL[i]);
2562                         p_s_tb->CFL[i] = NULL;
2563                         brelse(p_s_tb->CFR[i]);
2564                         p_s_tb->CFR[i] = NULL;
2565                 }
2566
2567                 if (wait_tb_buffers_run) {
2568                         for (i = 0; i < MAX_FEB_SIZE; i++) {
2569                                 if (p_s_tb->FEB[i]) {
2570                                         reiserfs_restore_prepared_buffer
2571                                             (p_s_tb->tb_sb, p_s_tb->FEB[i]);
2572                                 }
2573                         }
2574                 }
2575                 return n_ret_value;
2576         }
2577
2578 }
2579
2580 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2581    wanted to make lines shorter */
2582 void unfix_nodes(struct tree_balance *tb)
2583 {
2584         int i;
2585
2586         /* Release path buffers. */
2587         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2588
2589         /* brelse all resources collected for balancing */
2590         for (i = 0; i < MAX_HEIGHT; i++) {
2591                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2592                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2593                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2594                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2595                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2596                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2597
2598                 brelse(tb->L[i]);
2599                 brelse(tb->R[i]);
2600                 brelse(tb->FL[i]);
2601                 brelse(tb->FR[i]);
2602                 brelse(tb->CFL[i]);
2603                 brelse(tb->CFR[i]);
2604         }
2605
2606         /* deal with list of allocated (used and unused) nodes */
2607         for (i = 0; i < MAX_FEB_SIZE; i++) {
2608                 if (tb->FEB[i]) {
2609                         b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2610                         /* de-allocated block which was not used by balancing and
2611                            bforget about buffer for it */
2612                         brelse(tb->FEB[i]);
2613                         reiserfs_free_block(tb->transaction_handle, NULL,
2614                                             blocknr, 0);
2615                 }
2616                 if (tb->used[i]) {
2617                         /* release used as new nodes including a new root */
2618                         brelse(tb->used[i]);
2619                 }
2620         }
2621
2622         if (tb->vn_buf)
2623                 reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2624
2625 }