aboutsummaryrefslogtreecommitdiffstats
path: root/fs/tux3/tux3.h
blob: 4b3c4fb980c62d69736472bc5394b1e1d707f947 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
#ifndef TUX3_H
#define TUX3_H

#ifdef __KERNEL__
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/blkdev.h>	/* for struct blk_plug */
#include <linux/mutex.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/list_sort.h>

#include "trace.h"
#include "buffer.h"
#endif /* !__KERNEL__ */

#include "link.h"

#define fieldtype(compound, field) typeof(((compound *)NULL)->field)
#define vecset(d, v, n) memset((d), (v), (n) * sizeof(*(d)))
#define veccopy(d, s, n) memcpy((d), (s), (n) * sizeof(*(d)))
#define vecmove(d, s, n) memmove((d), (s), (n) * sizeof(*(d)))

typedef u64 inum_t;
typedef u64 tuxkey_t;

static inline void *encode16(void *at, unsigned val)
{
	*(__be16 *)at = cpu_to_be16(val);
	return at + sizeof(u16);
}

static inline void *encode32(void *at, unsigned val)
{
	*(__be32 *)at = cpu_to_be32(val);
	return at + sizeof(u32);
}

static inline void *encode64(void *at, u64 val)
{
	*(__be64 *)at = cpu_to_be64(val);
	return at + sizeof(u64);
}

static inline void *encode48(void *at, u64 val)
{
	at = encode16(at, val >> 32);
	return encode32(at, val);
}

static inline void *decode16(void *at, unsigned *val)
{
	*val = be16_to_cpup(at);
	return at + sizeof(u16);
}

static inline void *decode32(void *at, unsigned *val)
{
	*val = be32_to_cpup(at);
	return at + sizeof(u32);
}

static inline void *decode64(void *at, u64 *val)
{
	*val = be64_to_cpup(at);
	return at + sizeof(u64);
}

static inline void *decode48(void *at, u64 *val)
{
	unsigned part1, part2;
	at = decode16(at, &part1);
	at = decode32(at, &part2);
	*val = (u64)part1 << 32 | part2;
	return at;
}

/* Tux3 disk format */

/*
 * TUX3_LABEL includes the date of the last incompatible disk format change
 * NOTE: Always update this history for each incompatible change!
 *
 * Disk Format Revision History
 *
 * 2008-08-06: Beginning of time
 * 2008-09-06: Actual checking starts
 * 2008-12-12: Atom dictionary size in disksuper instead of atable->i_size
 * 2009-02-28: Attributes renumbered, rdev added
 * 2009-03-10: Alignment fix of disksuper
 * 2012-02-16: Update for atomic commit
 * 2012-07-02: Use timestamp 32.32 fixed point. Increase log_balloc size.
 * 2012-12-20: Add ->usedinodes
 * 2014-03-27: Change internal inum numbers. Change btree->root.depth.
 * 2014-05-06: Change timestamp format to nanosecond.
 */
#define TUX3_MAGIC		{ 't', 'u', 'x', '3', 0x20, 0x14, 0x05, 0x06 }
#define TUX3_MAGIC_STR					\
	((typeof(((struct disksuper *)0)->magic))TUX3_MAGIC)

#define TUX3_MAGIC_LOG		0x10ad
#define TUX3_MAGIC_BNODE	0xb4de
#define TUX3_MAGIC_DLEAF	0xbeaf
#define TUX3_MAGIC_ILEAF	0x90de
#define TUX3_MAGIC_OLEAF	0x6eaf

/* Number of available inum ("0" - "((1 << 48) - 1)") */
#define MAX_INODES_BITS		48
#define MAX_INODES		((u64)1 << MAX_INODES_BITS)
/* Maximum number of block address ("0" - "((1 << 48) - 1)") */
#define MAX_BLOCKS_BITS		48
#define MAX_BLOCKS		((block_t)1 << MAX_BLOCKS_BITS)

#define SB_LOC			(1 << 12)
#define SB_LEN			(1 << 12)	/* this is maximum blocksize */

#define MAX_TUXKEY		(((tuxkey_t)1 << 48) - 1)
#define TUXKEY_LIMIT		(MAX_TUXKEY + 1)

/* Special inode numbers */
#define TUX_INVALID_INO		0	/* FIXME: just for debugging */
#define TUX_BITMAP_INO		1
#define TUX_COUNTMAP_INO	2	/* Block group free count map */
#define TUX_VTABLE_INO		3
#define TUX_ATABLE_INO		4
#define TUX_VOLMAP_INO		61	/* This doesn't have entry in ileaf */
#define TUX_LOGMAP_INO		62	/* This is volmap for log blocks */
#define TUX_ROOTDIR_INO		63
#define TUX_NORMAL_INO		64	/* until this ino, reserved ino */

struct disksuper {
	/* Update magic on any incompatible format change */
	char magic[8];		/* Contains TUX3_LABEL magic string */
	__be64 birthdate;	/* Volume creation date */
	__be64 flags;		/* Need to assign some flags */
	__be16 blockbits;	/* Shift to get volume block size */
	__be16 unused[3];	/* Padding for alignment */
	__be64 volblocks;	/* Volume size */

	/* The rest should be moved to a "metablock" that is updated frequently */
	__be64 iroot;		/* Root of the itree */
	__be64 oroot;		/* Root of the otree */
	__be64 usedinodes;	/* Number of using inode numbers.  (Instead of
				 * free inode numbers).  With this, we can
				 * change the maximum inodes without changing
				 * usedinodes on disk.
				 */
	__be64 nextblock;	/* Get rid of this when we have a real allocation policy */
	__be64 atomdictsize;	/*
				 * Size of the atom dictionary instead of i_size
				 * FIXME: we are better to remove this somehow
				 */
	__be32 freeatom;	/* Beginning of persistent free atom list in atable */
	__be32 atomgen;		/* Next atom number if there are no free atoms */
	__be64 logchain;	/* Most recent delta commit block pointer */
	__be32 logcount;	/* Count of log blocks in the current log chain */
} __packed;

enum { MAX_DIRECT_COUNT = SHRT_MAX };

/* FIXME: maybe better to remove struct root to reduce holes in structure? */
struct root {
	unsigned short direct; /* block/depth is an extent instead of btree */
	union {
		short count;	/* Number of blocks in direct extent */
		short depth;	/* Btree levels include leaf level */
	};
	block_t block;	/* Disk location of btree root */
};

struct btree {
	struct rw_semaphore lock;
	struct sb *sb;		/* Convenience to reduce parameter list size */
	struct btree_ops *ops;	/* Generic btree low level operations */
	struct root root;	/* Cached description of btree root */
	u16 entries_per_leaf;	/* Used in btree leaf splitting */
};

/* Define layout of btree root on disk, endian conversion is elsewhere. */

static inline u64 pack_root(struct root *root)
{
	return (u64)root->direct << 63 | (u64)root->depth << 48 | root->block;
}

static inline struct root unpack_root(u64 v)
{
	return (struct root){
		.direct = v >> 63,
		.depth = (v >> 48) & 0x7fff,
		.block = v & (-1ULL >> 16),
	};
}

/* Path cursor for btree traversal */

struct cursor {
	struct btree *btree;
#define CURSOR_DEBUG
#ifdef CURSOR_DEBUG
#define FREE_BUFFER	((void *)0xdbc06505)
#define FREE_NEXT	((void *)0xdbc06507)
	int maxlevel;
#endif
	int level;
	struct path_level {
		struct buffer_head *buffer;
		struct index_entry *next;
	} path[];
};

struct stash { struct flink_head head; u64 *pos, *top; };

/* Flush synchronously */
#define TUX3_FLUSHER_SYNC		1
/* Flush asynchronously by own timing */
#define TUX3_FLUSHER_ASYNC_OWN		2
/* Flush asynchronously by kernel normal timing (by hackish way) */
#define TUX3_FLUSHER_ASYNC_HACK		3

/* Refcount for delta */
struct delta_ref {
	atomic_t refcount;
	unsigned delta;
#ifdef UNIFY_DEBUG
	int unify_flag;	/* FIXME: is there better way? */
#endif
};

/* Per-delta data structure for sb */
struct sb_delta_dirty {
	struct list_head dirty_inodes;	/* dirty inodes list */
};

/* Pin a block in cache and keep a pointer to it */
struct countmap_pin {
	struct buffer_head *buffer;
};

struct tux3_idefer_map;
/* Tux3-specific sb is a handle for the entire volume state */
struct sb {
	union {
		struct disksuper super;
		char thisbig[SB_LEN];
	};

#if TUX3_FLUSHER == TUX3_FLUSHER_SYNC
	struct rw_semaphore delta_lock;		/* delta transition exclusive */
#endif
	struct delta_ref __rcu *current_delta;	/* current delta */
	struct delta_ref delta_refs[TUX3_MAX_DELTA];
	unsigned next_delta;			/* delta commit cycle */
	unsigned unify;				/* log unify cycle */

#define TUX3_COMMIT_RUNNING_BIT		0
#define TUX3_COMMIT_PENDING_BIT		1
	unsigned long backend_state;		/* delta state */
#ifdef UNIFY_DEBUG
	struct delta_ref *pending_delta;	/* pending delta for commit */
#endif
	unsigned marshal_delta;			/* marshaling delta */
	unsigned committed_delta;		/* committed delta */
	wait_queue_head_t delta_event_wq;	/* wait queue for delta event */
#if TUX3_FLUSHER == TUX3_FLUSHER_ASYNC_OWN
	struct task_struct *flush_task;		/* work to flush delta */
#endif
#if TUX3_FLUSHER == TUX3_FLUSHER_ASYNC_HACK
	struct backing_dev_info bdi;
#endif

	struct btree itree;	/* Inode btree */
	struct btree otree;	/* Orphan btree */
	struct inode *volmap;	/* Volume metadata cache (like blockdev). */
	struct inode *bitmap;	/* allocation bitmap special file */
	struct inode *countmap;	/* block group free count map */
	struct inode *rootdir;	/* root directory special file */
	struct inode *vtable;	/* version table special file */
	struct inode *atable;	/* xattr atom special file */

	unsigned blocksize, blockbits, blockmask, groupbits;
	u64 freeinodes;		/* Number of free inode numbers. This is
				 * including the deferred allocated inodes */
	block_t volblocks, volmask, freeblocks, nextblock;
	inum_t nextinum;	/* FIXME: temporary hack to avoid to find
				 * same area in itree for free inum. */
	unsigned entries_per_node; /* must be per-btree type, get rid of this */
	unsigned version;	/* Currently mounted volume version view */

	unsigned atomref_base;	/* Index of atom refcount base */
	unsigned unatom_base;	/* Index of unatom base */
	loff_t atomdictsize;	/* Atom dictionary size */
	unsigned freeatom;	/* Start of free atom list in atom table */
	unsigned atomgen;	/* Next atom number to allocate if no free atoms */

	/*
	 * For backend only
	 */
	struct inode *logmap;	/* Log block cache */
	unsigned lognext;	/* Index of next log block in log map */
	struct buffer_head *logbuf; /* Cached log block */
	unsigned char *logpos, *logtop; /* Where to emit next log entry */

	struct list_head orphan_add; /* defered orphan inode add list */
	struct list_head orphan_del; /* defered orphan inode del list */

	struct stash defree;	/* defer extent frees until after delta */
	struct stash deunify;	/* defer extent frees until after unify */

	struct list_head unify_buffers; /* dirty metadata flushed at unify */

	struct iowait *iowait;		/* helper for waiting I/O */

	/*
	 * For frontend and backend
	 */
	spinlock_t countmap_lock;
	struct countmap_pin countmap_pin;
	struct tux3_idefer_map *idefer_map;
	struct list_head alloc_inodes;	/* deferred inum allocation inodes */

	spinlock_t forked_buffers_lock;
	struct link forked_buffers;	/* forked buffers list */

	spinlock_t dirty_inodes_lock;	/* lock of dirty_inodes for frontend */
	/* Per-delta dirty data for sb */
	struct sb_delta_dirty s_ddc[TUX3_MAX_DELTA];
#ifdef __KERNEL__
	struct super_block *vfs_sb;	/* Generic kernel superblock */
#else
	struct dev *dev;		/* userspace block device */
	loff_t s_maxbytes;		/* maximum file size */
#endif
};

/* Block segment (physical block extent) info */
#define BLOCK_SEG_HOLE		(1 << 0)
#define BLOCK_SEG_NEW		(1 << 1)

struct block_segment {
	block_t block;		/* Start of physical address */
	unsigned count;		/* Number of blocks */
	unsigned state;		/* State of this segment */
};

/*
 * Balloc flags
 */
/* Allow fewer allocation than requested */
#define BALLOC_PARTIAL		(1 << 0)

/* logging  */

struct logblock {
	__be16 magic;		/* Magic number */
	__be16 bytes;		/* Total data bytes on this block */
	u32 unused;		/* padding */
	__be64 logchain;	/* Block number to previous logblock */
	unsigned char data[];	/* Log data */
};

enum {
	LOG_BALLOC = 0x33,	/* Log of block allocation */
	LOG_BFREE,		/* Log of freeing block after delta */
	LOG_BFREE_ON_UNIFY,	/* Log of freeing block after unify */
	LOG_BFREE_RELOG,	/* LOG_BFREE, but re-log of free after unify */
	LOG_LEAF_REDIRECT,	/* Log of leaf redirect */
	LOG_LEAF_FREE,		/* Log of freeing leaf */
	LOG_BNODE_REDIRECT,	/* Log of bnode redirect */
	LOG_BNODE_ROOT,		/* Log of new bnode root allocation */
	LOG_BNODE_SPLIT,	/* Log of spliting bnode to new bnode */
	LOG_BNODE_ADD,		/* Log of adding bnode index */
	LOG_BNODE_UPDATE,	/* Log of bnode index ->block update */
	LOG_BNODE_MERGE,	/* Log of merging 2 bnodes */
	LOG_BNODE_DEL,		/* Log of deleting bnode index */
	LOG_BNODE_ADJUST,	/* Log of bnode index ->key adjust */
	LOG_BNODE_FREE,		/* Log of freeing bnode */
	LOG_ORPHAN_ADD,		/* Log of adding orphan inode */
	LOG_ORPHAN_DEL,		/* Log of deleting orphan inode */
	LOG_FREEBLOCKS,		/* Log of freeblocks in bitmap on unify */
	LOG_UNIFY,		/* Log of marking unify */
	LOG_DELTA,		/* just for debugging */
	LOG_TYPES
};

/* For debugging, MAX_ATTRS is smaller than 31, so present never be -1 */
#define TUX3_INVALID_PRESENT		(-1U)

/* Inode attributes data */
struct tux3_iattr_data {
	unsigned	present;

	umode_t		i_mode;
	uid_t		i_uid;
	gid_t		i_gid;
	unsigned int	i_nlink;
	dev_t		i_rdev;
	loff_t		i_size;
//	struct timespec	i_atime;
	struct timespec	i_mtime;
	struct timespec	i_ctime;
	u64		i_version;
};

/* Per-delta data structure for inode */
struct inode_delta_dirty {
	struct list_head dirty_buffers;	/* list for dirty buffers */
	struct list_head dirty_holes;	/* list for hole extents */
	struct list_head dirty_list;	/* link for dirty inode list */

	/* Forked data storage */
	/* FIXME: we don't need this for frontend delta. We would want
	 * to allocate dynamically */
	struct tux3_iattr_data idata;
};

struct xcache;
struct tux3_inode {
	struct btree btree;
	inum_t inum;			/* Inode number */
	struct xcache *xcache;		/* Extended attribute cache */
	struct list_head alloc_list;	/* link for deferred inum allocation */
	struct list_head orphan_list;	/* link for orphan inode list */

	/* FIXME: we can use RCU for hole_extents? */
	spinlock_t hole_extents_lock;	/* lock for hole_extents */
	struct list_head hole_extents;	/* hole extents list */

	struct rw_semaphore truncate_lock; /* lock for truncate and mmap */
	spinlock_t lock;		/* lock for inode metadata */
	/* Per-delta dirty data for inode */
	unsigned flags;			/* flags for inode state */
	unsigned present;		/* Attributes decoded from or
					 * to be encoded to itree */
	struct inode_delta_dirty i_ddc[TUX3_MAX_DELTA];
#ifdef __KERNEL__
	int (*io)(int rw, struct bufvec *bufvec);
#endif
	/* Generic inode */
	struct inode vfs_inode;
};

static inline struct tux3_inode *tux_inode(struct inode *inode)
{
	return container_of(inode, struct tux3_inode, vfs_inode);
}

static inline struct inode *btree_inode(struct btree *btree)
{
	return &container_of(btree, struct tux3_inode, btree)->vfs_inode;
}
#ifdef __KERNEL__
static inline struct sb *tux_sb(struct super_block *sb)
{
	return sb->s_fs_info;
}

static inline struct super_block *vfs_sb(struct sb *sb)
{
	return sb->vfs_sb;
}

typedef struct address_space map_t;

static inline map_t *mapping(struct inode *inode)
{
	return inode->i_mapping;
}

static inline struct block_device *sb_dev(struct sb *sb)
{
	return sb->vfs_sb->s_bdev;
}
#else /* !__KERNEL__ */
static inline struct sb *tux_sb(struct sb *sb)
{
	return sb;
}

static inline struct sb *vfs_sb(struct sb *sb)
{
	return sb;
}

static inline map_t *mapping(struct inode *inode)
{
	return inode->map;
}

static inline struct dev *sb_dev(struct sb *sb)
{
	return sb->dev;
}
#endif /* !__KERNEL__ */

/* Get delta from free running counter */
static inline unsigned tux3_delta(unsigned delta)
{
	return delta & (TUX3_MAX_DELTA - 1);
}

/* Get per-delta dirty data control for sb */
static inline struct sb_delta_dirty *tux3_sb_ddc(struct sb *sb, unsigned delta)
{
	return &sb->s_ddc[tux3_delta(delta)];
}

/* Get per-delta dirty data control for inode */
static inline struct inode_delta_dirty *tux3_inode_ddc(struct inode *inode,
						       unsigned delta)
{
	return &tux_inode(inode)->i_ddc[tux3_delta(delta)];
}

static inline struct tux3_inode *i_ddc_to_inode(struct inode_delta_dirty *i_ddc,
						unsigned delta)
{
	return container_of(i_ddc, struct tux3_inode, i_ddc[tux3_delta(delta)]);
}

/* Get per-delta dirty buffers list from inode */
static inline struct list_head *tux3_dirty_buffers(struct inode *inode,
						   unsigned delta)
{
	return &tux3_inode_ddc(inode, delta)->dirty_buffers;
}

struct tux_iattr {
	kuid_t	uid;
	kgid_t	gid;
	umode_t	mode;
};

static inline struct btree *itree_btree(struct sb *sb)
{
	return &sb->itree;
}

static inline struct btree *otree_btree(struct sb *sb)
{
	return &sb->otree;
}

#define TUX_LINK_MAX (1 << 15) /* arbitrary limit, increase it */

#define TUX_NAME_LEN 255

/* directory entry */
typedef struct {
	__be64 inum;
	__be16 rec_len;
	u8 name_len, type;
	char name[];
	/*
	 * On 64bit arch sizeof(tux_dirent) == 16. We should use
	 * offsetof(tux_dirent, name) instead.
	 */
	/* u32 __pad; */
} tux_dirent;

struct btree_key_range {
	tuxkey_t start;
	unsigned len;
};

enum btree_result {
	BTREE_DO_RETRY = 0,
	BTREE_DO_DIRTY,
	BTREE_DO_SPLIT,
};

struct btree_ops {
	void (*btree_init)(struct btree *btree);
	int (*leaf_init)(struct btree *btree, void *leaf);
	tuxkey_t (*leaf_split)(struct btree *btree, tuxkey_t hint, void *from, void *into);
	/* return value: 1 - modified, 0 - not modified, < 0 - error */
	int (*leaf_chop)(struct btree *btree, tuxkey_t start, u64 len, void *leaf);
	/* return value: 1 - merged, 0 - couldn't merge */
	int (*leaf_merge)(struct btree *btree, void *into, void *from);
	/* return value: < 0 - error, 0 >= - btree_result */
	int (*leaf_pre_write)(struct btree *btree, tuxkey_t key_bottom, tuxkey_t key_limit, void *leaf, struct btree_key_range *key);
	/* return value: < 0 - error, 0 >= - btree_result */
	int (*leaf_write)(struct btree *btree, tuxkey_t key_bottom, tuxkey_t key_limit, void *leaf, struct btree_key_range *key, tuxkey_t *split_hint);
	int (*leaf_read)(struct btree *btree, tuxkey_t key_bottom, tuxkey_t key_limit, void *leaf, struct btree_key_range *key);

	void *private_ops;

	/*
	 * for debugging
	 */
	int (*leaf_sniff)(struct btree *btree, void *leaf);
	/* return value: 1 - can free, 0 - can't free */
	int (*leaf_can_free)(struct btree *btree, void *leaf);
};

/* Information for replay */
struct replay {
	struct sb *sb;

	/* For orphan.c */
	struct list_head log_orphan_add;   /* To remember LOG_ORPHAN_ADD */
	struct list_head orphan_in_otree; /* Orphan inodes in sb->otree */

	/* For replay.c */
	void *unify_pos;	/* position of unify log in a log block */
	block_t unify_index;	/* index of a log block including unify log */
	block_t blocknrs[];	/* block address of log blocks */
};

/* Does this root has direct extent? */
static inline int has_direct_extent(struct btree *btree)
{
	return btree->root.direct;
}

extern struct root no_root;
/* Does this root has bnode/leaf? */
static inline int has_root(struct btree *btree)
{
	return !has_direct_extent(btree) && btree->root.depth > 0;
}

/* Doesn't have both btree or direct extent? */
static inline int has_no_root(struct btree *btree)
{
	return btree->root.depth == 0;
}

/* Redirect ptr which is pointing data of src from src to dst */
static inline void *ptr_redirect(void *ptr, void *src, void *dst)
{
	if (ptr) {
		assert(ptr >= src);
		return dst + (ptr - src);
	}
	return NULL;
}

#ifdef __KERNEL__
static inline struct timespec gettime(void)
{
	return current_kernel_time();
}

static inline struct inode *buffer_inode(struct buffer_head *buffer)
{
	return buffer->b_page->mapping->host;
}

/* Get logical index of buffer */
static inline block_t bufindex(struct buffer_head *buffer)
{
	struct page *page = buffer->b_page;
	/* FIXME: maybe we want to remove buffer->b_size */
#if BITS_PER_LONG == 64
	return (page_offset(page) + bh_offset(buffer)) / buffer->b_size;
#else
	const int blockbits = ffs(buffer->b_size) - 1;
	return (page_offset(page) + bh_offset(buffer)) >> blockbits;
#endif
}

/* dir.c */
extern const struct file_operations tux_dir_fops;
extern const struct inode_operations tux_dir_iops;

/* filemap.c */
int tux3_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create);
struct buffer_head *__get_buffer(struct page *page, int offset);
void tux3_try_cancel_dirty_page(struct page *page);
void __tux3_set_page_dirty_account(struct page *page,
				   struct address_space *mapping, int warn);
int tux3_file_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct address_space_operations tux_file_aops;
extern const struct address_space_operations tux_symlink_aops;
extern const struct address_space_operations tux_blk_aops;
extern const struct address_space_operations tux_vol_aops;

/* inode.c */
int tux3_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
int tux3_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
int tux3_setattr(struct dentry *dentry, struct iattr *iattr);

/* symlink.c */
extern const struct inode_operations tux_symlink_iops;

/* utility.c */
int vecio(int rw, struct block_device *dev, loff_t offset, unsigned vecs,
	  struct bio_vec *vec, bio_end_io_t endio, void *info);
int syncio(int rw, struct block_device *dev, loff_t offset, unsigned vecs,
	   struct bio_vec *vec);
int devio(int rw, struct block_device *dev, loff_t offset, void *data,
	  unsigned len);
int blockio(int rw, struct sb *sb, struct buffer_head *buffer, block_t block);
int blockio_vec(int rw, struct bufvec *bufvec, block_t block, unsigned count);

#define tux3_msg(sb, fmt, ...)						\
	__tux3_msg(sb, KERN_INFO, "", fmt, ##__VA_ARGS__)
#define __tux3_err(sb, func, line, fmt, ...)				\
	__tux3_msg(sb, KERN_ERR, " error",				\
		   "%s:%d: " fmt, func, line, ##__VA_ARGS__)
#define tux3_err(sb, fmt, ...)						\
	__tux3_err(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define tux3_warn(sb, fmt, ...)					\
	__tux3_msg(sb, KERN_WARNING, " warning", fmt, ##__VA_ARGS__)

/* temporary hack for buffer */
struct buffer_head *peekblk(struct address_space *mapping, block_t iblock);
struct buffer_head *blockread(struct address_space *mapping, block_t iblock);
struct buffer_head *blockget(struct address_space *mapping, block_t iblock);
#endif /* !__KERNEL__ */

/* balloc.c */
void countmap_put(struct countmap_pin *countmap_pin);
int countmap_used(struct sb *sb, block_t group);
void bitmap_dump(struct inode *inode, block_t start, block_t count);
int balloc_find_range(struct sb *sb,
	struct block_segment *seg, int maxsegs, int *segs,
	block_t start, block_t range, unsigned *blocks);
int balloc_find(struct sb *sb,
	struct block_segment *seg, int maxsegs, int *segs,
	unsigned *blocks);
int balloc_use(struct sb *sb, struct block_segment *seg, int segs);
int balloc_segs(struct sb *sb,
	struct block_segment *seg, int maxsegs, int *segs,
	unsigned *blocks);
block_t balloc_one(struct sb *sb);
int bfree_segs(struct sb *sb, struct block_segment *seg, int segs);
int bfree(struct sb *sb, block_t start, unsigned blocks);
int replay_update_bitmap(struct replay *rp, block_t start, unsigned blocks, int set);

/* btree.c */
unsigned calc_entries_per_node(unsigned blocksize);
struct buffer_head *cursor_leafbuf(struct cursor *cursor);
void release_cursor(struct cursor *cursor);
struct cursor *alloc_cursor(struct btree *btree, int);
void free_cursor(struct cursor *cursor);

void init_btree(struct btree *btree, struct sb *sb, struct root root, struct btree_ops *ops);
int btree_alloc_empty(struct btree *btree);
int btree_free_empty(struct btree *btree);
struct buffer_head *new_leaf(struct btree *btree);
tuxkey_t cursor_next_key(struct cursor *cursor);
tuxkey_t cursor_this_key(struct cursor *cursor);
int btree_probe(struct cursor *cursor, tuxkey_t key);
typedef int (*btree_traverse_func_t)(struct btree *btree, tuxkey_t key_bottom,
				     tuxkey_t key_limit, void *leaf,
				     tuxkey_t key, u64 len, void *data);
int btree_traverse(struct cursor *cursor, tuxkey_t key, u64 len,
		   btree_traverse_func_t func, void *data);
int btree_chop(struct btree *btree, tuxkey_t start, u64 len);
int btree_insert_leaf(struct cursor *cursor, tuxkey_t key, struct buffer_head *leafbuf);
void *btree_expand(struct cursor *cursor, tuxkey_t key, unsigned newsize);
int noop_pre_write(struct btree *btree, tuxkey_t key_bottom, tuxkey_t key_limit,
		   void *leaf, struct btree_key_range *key);
int btree_write(struct cursor *cursor, struct btree_key_range *key);
int btree_read(struct cursor *cursor, struct btree_key_range *key);
int cursor_redirect(struct cursor *cursor);
int replay_bnode_redirect(struct replay *rp, block_t oldblock, block_t newblock);
int replay_bnode_root(struct replay *rp, block_t root, unsigned count,
		      block_t left, block_t right, tuxkey_t rkey);
int replay_bnode_split(struct replay *rp, block_t src, unsigned pos, block_t dst);
int replay_bnode_add(struct replay *rp, block_t parent, block_t child, tuxkey_t key);
int replay_bnode_update(struct replay *rp, block_t parent, block_t child, tuxkey_t key);
int replay_bnode_merge(struct replay *rp, block_t src, block_t dst);
int replay_bnode_del(struct replay *rp, block_t bnode, tuxkey_t key, unsigned count);
int replay_bnode_adjust(struct replay *rp, block_t bnode, tuxkey_t from, tuxkey_t to);

/* commit.c */
int setup_sb(struct sb *sb, struct disksuper *super);
int load_sb(struct sb *sb);
int save_sb(struct sb *sb);
void tux3_start_backend(struct sb *sb);
void tux3_end_backend(void);
int tux3_under_backend(struct sb *sb);
int force_unify(struct sb *sb);
int force_delta(struct sb *sb);
unsigned tux3_get_current_delta(void);
unsigned tux3_inode_delta(struct inode *inode);
void change_begin_atomic(struct sb *sb);
void change_end_atomic(struct sb *sb);
void change_begin_atomic_nested(struct sb *sb, void **ptr);
void change_end_atomic_nested(struct sb *sb, void *ptr);
void change_begin(struct sb *sb);
int change_end(struct sb *sb);
void change_begin_if_needed(struct sb *sb, int need_sep);
void change_end_if_needed(struct sb *sb);

/* commit_flusher.c */
#include "commit_flusher.h"

/* dir.c */
void tux_set_entry(struct buffer_head *buffer, tux_dirent *entry,
		   inum_t inum, umode_t mode);
void tux_update_dirent(struct inode *dir, struct buffer_head *buffer,
		       tux_dirent *entry, struct inode *new_inode);
loff_t tux_alloc_entry(struct inode *dir, const char *name, unsigned len,
		       loff_t *size, struct buffer_head **hold);
int tux_create_dirent(struct inode *dir, const struct qstr *qstr,
		      struct inode *inode);
tux_dirent *tux_find_entry(struct inode *dir, const char *name, unsigned len,
			   struct buffer_head **result, loff_t size);
tux_dirent *tux_find_dirent(struct inode *dir, const struct qstr *qstr,
			    struct buffer_head **result);
int tux_delete_entry(struct inode *dir, struct buffer_head *buffer,
		     tux_dirent *entry);
int tux_delete_dirent(struct inode *dir, struct buffer_head *buffer,
		      tux_dirent *entry);
int tux_readdir(struct file *file, struct dir_context *ctx);
int tux_dir_is_empty(struct inode *dir);

/* dleaf.c */
extern struct btree_ops dtree_ops;

/* filemap.c */
int dtree_chop(struct btree *btree, tuxkey_t start, u64 len);
int tux3_filemap_overwrite_io(int rw, struct bufvec *bufvec);
int tux3_filemap_redirect_io(int rw, struct bufvec *bufvec);
int tux3_truncate_partial_block(struct inode *inode, loff_t newsize);
void tux3_truncate_pagecache(struct inode *inode, loff_t newsize);

/* iattr.c */
void dump_attrs(struct inode *inode);
void *encode_kind(void *attrs, unsigned kind, unsigned version);
void *decode_kind(void *attrs, unsigned *kind, unsigned *version);
extern struct ileaf_attr_ops iattr_ops;

/* ileaf.c */
struct ileaf;
void *ileaf_lookup(struct btree *btree, inum_t inum, struct ileaf *leaf, unsigned *result);
int ileaf_find_free(struct btree *btree, tuxkey_t key_bottom,
		    tuxkey_t key_limit, void *leaf,
		    tuxkey_t key, u64 len, void *data);
struct ileaf_enumrate_cb {
	int (*callback)(struct btree *btree, inum_t inum, void *attrs,
			unsigned size, void *data);
	void *data;
};
int ileaf_enumerate(struct btree *btree, tuxkey_t key_bottom,
		    tuxkey_t key_limit, void *leaf,
		    tuxkey_t key, u64 len, void *data);
extern struct btree_ops itree_ops;
extern struct btree_ops otree_ops;

/* inode.c */
void tux3_inode_copy_attrs(struct inode *inode, unsigned delta);
struct inode *tux_new_volmap(struct sb *sb);
struct inode *tux_new_logmap(struct sb *sb);
struct inode *tux_new_inode(struct inode *dir, struct tux_iattr *iattr,
			    dev_t rdev);
struct tux3_idefer_map *tux3_alloc_idefer_map(void);
void tux3_free_idefer_map(struct tux3_idefer_map *map);
int __init tux3_init_idefer_cache(void);
void tux3_destroy_idefer_cache(void);
void del_defer_alloc_inum(struct inode *inode);
void cancel_defer_alloc_inum(struct inode *inode);
int tux_assign_inum(struct inode *inode, inum_t goal);
struct inode *tux_create_specific_inode(struct inode *dir, inum_t inum,
					struct tux_iattr *iattr, dev_t rdev);
struct inode *tux3_iget(struct sb *sb, inum_t inum);
struct inode *tux3_ilookup_nowait(struct sb *sb, inum_t inum);
struct inode *tux3_ilookup(struct sb *sb, inum_t inum);
int tux3_save_inode(struct inode *inode, struct tux3_iattr_data *idata,
		    unsigned delta);
int tux3_purge_inode(struct inode *inode, struct tux3_iattr_data *idata,
		     unsigned delta);
int tux3_drop_inode(struct inode *inode);
void tux3_evict_inode(struct inode *inode);
void iget_if_dirty(struct inode *inode);

/* log.c */
extern unsigned log_size[];
void log_next(struct sb *sb);
void log_drop(struct sb *sb);
void log_finish(struct sb *sb);
void log_finish_cycle(struct sb *sb, int discard);
int tux3_logmap_io(int rw, struct bufvec *bufvec);
void log_balloc(struct sb *sb, block_t block, unsigned count);
void log_bfree(struct sb *sb, block_t block, unsigned count);
void log_bfree_on_unify(struct sb *sb, block_t block, unsigned count);
void log_bfree_relog(struct sb *sb, block_t block, unsigned count);
void log_leaf_redirect(struct sb *sb, block_t oldblock, block_t newblock);
void log_leaf_free(struct sb *sb, block_t leaf);
void log_bnode_redirect(struct sb *sb, block_t oldblock, block_t newblock);
void log_bnode_root(struct sb *sb, block_t root, unsigned count,
		    block_t left, block_t right, tuxkey_t rkey);
void log_bnode_split(struct sb *sb, block_t src, unsigned pos, block_t dst);
void log_bnode_add(struct sb *sb, block_t parent, block_t child, tuxkey_t key);
void log_bnode_update(struct sb *sb, block_t parent, block_t child,
		      tuxkey_t key);
void log_bnode_merge(struct sb *sb, block_t src, block_t dst);
void log_bnode_del(struct sb *sb, block_t node, tuxkey_t key, unsigned count);
void log_bnode_adjust(struct sb *sb, block_t node, tuxkey_t from, tuxkey_t to);
void log_bnode_free(struct sb *sb, block_t bnode);
void log_orphan_add(struct sb *sb, unsigned version, tuxkey_t inum);
void log_orphan_del(struct sb *sb, unsigned version, tuxkey_t inum);
void log_freeblocks(struct sb *sb, block_t freeblocks);
void log_delta(struct sb *sb);
void log_unify(struct sb *sb);

typedef int (*unstash_t)(struct sb *sb, u64 val);
void stash_init(struct stash *stash);
int stash_value(struct stash *stash, u64 value);
int unstash(struct sb *sb, struct stash *defree, unstash_t actor);
int stash_walk(struct sb *sb, struct stash *stash, unstash_t actor);
int defer_bfree(struct sb *sb, struct stash *defree,
		block_t block, unsigned count);
void destroy_defer_bfree(struct stash *defree);

/* orphan.c */
void clean_orphan_list(struct list_head *head);
extern struct ileaf_attr_ops oattr_ops;
int tux3_unify_orphan_add(struct sb *sb, struct list_head *orphan_add);
int tux3_unify_orphan_del(struct sb *sb, struct list_head *orphan_del);
int tux3_make_orphan_add(struct inode *inode);
int tux3_make_orphan_del(struct inode *inode);
int replay_orphan_add(struct replay *rp, unsigned version, inum_t inum);
int replay_orphan_del(struct replay *rp, unsigned version, inum_t inum);
void replay_iput_orphan_inodes(struct sb *sb,
			       struct list_head *orphan_in_otree,
			       int destroy);
int replay_load_orphan_inodes(struct replay *rp);

/* super.c */
struct replay *tux3_init_fs(struct sb *sbi);

/* policy.c */
inum_t policy_inum(struct inode *dir, loff_t where, struct inode *inode);
void policy_inode_init(inum_t *previous);
void policy_inode(struct inode *inode, inum_t *previous);
void policy_extents(struct bufvec *bufvec);

/* replay.c */
struct replay *replay_stage1(struct sb *sb);
int replay_stage2(struct replay *rp);
int replay_stage3(struct replay *rp, int apply);

/* utility.c */
void __printf(4, 5)
__tux3_msg(struct sb *sb, const char *level, const char *prefix,
	   const char *fmt, ...);
void __printf(1, 2)
__tux3_dbg(const char *fmt, ...);
#define tux3_dbg(fmt , ...)						\
	__tux3_dbg("%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
void __printf(4, 5)
__tux3_fs_error(struct sb *sb, const char *func, unsigned int line,
		const char *fmt, ...);
#define tux3_fs_error(sb, fmt, ...)	 \
	__tux3_fs_error(sb, __func__, __LINE__, fmt , ##__VA_ARGS__)

void hexdump(void *data, unsigned size);
void set_bits(u8 *bitmap, unsigned start, unsigned count);
void clear_bits(u8 *bitmap, unsigned start, unsigned count);
int all_set(u8 *bitmap, unsigned start, unsigned count);
int all_clear(u8 *bitmap, unsigned start, unsigned count);
int bytebits(u8 c);

/* writeback.c */
void tux3_set_inode_no_flush(struct inode *inode);
void tux3_set_inode_always_dirty(struct inode *inode);
void tux3_mark_btree_dirty(struct btree *btree);
void __tux3_mark_inode_dirty(struct inode *inode, int flags);
static inline void tux3_mark_inode_dirty(struct inode *inode)
{
	__tux3_mark_inode_dirty(inode, I_DIRTY);
}
static inline void tux3_mark_inode_dirty_sync(struct inode *inode)
{
	__tux3_mark_inode_dirty(inode, I_DIRTY_SYNC);
}

void tux3_dirty_inode(struct inode *inode, int flags);
void tux3_mark_inode_to_delete(struct inode *inode);
void tux3_iattrdirty(struct inode *inode);
void tux3_xattrdirty(struct inode *inode);
void tux3_xattr_read_and_clear(struct inode *inode);
void tux3_clear_dirty_inode(struct inode *inode);
void __tux3_mark_buffer_dirty(struct buffer_head *buffer, unsigned delta);
void tux3_mark_buffer_dirty(struct buffer_head *buffer);
void tux3_mark_buffer_unify(struct buffer_head *buffer);
void tux3_mark_inode_orphan(struct tux3_inode *tuxnode);
int tux3_inode_is_orphan(struct tux3_inode *tuxnode);
int tux3_flush_inode_internal(struct inode *inode, unsigned delta, int req_flag);
int tux3_flush_inode(struct inode *inode, unsigned delta, int req_flag);
int tux3_flush_inodes(struct sb *sb, unsigned delta);
int tux3_has_dirty_inodes(struct sb *sb, unsigned delta);
void tux3_clear_dirty_inodes(struct sb *sb, unsigned delta);
void tux3_check_destroy_inode_flags(struct inode *inode);

/* xattr.c */
#ifndef ENOATTR
#define ENOATTR ENODATA
#endif

void atable_init_base(struct sb *sb);
int xcache_dump(struct inode *inode);
void free_xcache(struct inode *inode);
int new_xcache(struct inode *inode, unsigned size);
int xcache_remove_all(struct inode *inode);
int get_xattr(struct inode *inode, const char *name, unsigned len,
	      void *data, unsigned size);
int set_xattr(struct inode *inode, const char *name, unsigned len,
	      const void *data, unsigned size, unsigned flags);
int del_xattr(struct inode *inode, const char *name, unsigned len);
int list_xattr(struct inode *inode, char *text, size_t size);
unsigned encode_xsize(struct inode *inode);
void *encode_xattrs(struct inode *inode, void *attrs, unsigned size);
unsigned decode_xsize(struct inode *inode, void *attrs, unsigned size);
void *decode_xattr(struct inode *inode, void *attrs);

static inline struct buffer_head *vol_find_get_block(struct sb *sb, block_t block)
{
	return peekblk(mapping(sb->volmap), block);
}

static inline struct buffer_head *vol_getblk(struct sb *sb, block_t block)
{
	return blockget(mapping(sb->volmap), block);
}

static inline struct buffer_head *vol_bread(struct sb *sb, block_t block)
{
	return blockread(mapping(sb->volmap), block);
}

#include "dirty-buffer.h"	/* remove this after atomic commit */
#endif /* !TUX3_H */