aboutsummaryrefslogtreecommitdiffstats
path: root/design/XFS_Filesystem_Structure/directories.asciidoc
blob: 06d682e56bf19a48be4463b31dd4493e412e27c1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
[[Directories]]
= Directories

[NOTE]
Only v2 directories covered here. v1 directories are obsolete.

[NOTE]
The term ``block'' in this section will refer to directory blocks, not filesystem
blocks unless otherwise specified.

The size of a ``directory block'' is defined by the xref:Superblocks[superblock's]
+sb_dirblklog+ value. The size in bytes = +sb_blocksize+ × 2^sb_dirblklog^.
For example, if +sb_blocksize+ = 4096 and +sb_dirblklog+ = 2, the directory block
size is 16384 bytes. Directory blocks are always allocated in multiples based on
+sb_dirblklog+. Directory blocks cannot be more that 65536 bytes in size.

All directory entries contain the following ``data'':

* The entry's name (counted string consisting of a single byte +namelen+
followed by +name+ consisting of an array of 8-bit chars without a NULL
terminator).

* The entry's absolute xref:Inode_Numbers[inode number], which are
always 64 bits (8 bytes) in size except a special case for shortform
directories.

* An +offset+ or +tag+ used for iterative readdir calls.

* If the +XFS_SB_FEAT_INCOMPAT_FTYPE+ feature flag is set, each
directory entry contains an +ftype+ field that caches the inode's type
to avoid having to perform an inode lookup.

.ftype values
[options="header"]
|=====
| Flag				| Description
| +XFS_DIR3_FT_UNKNOWN+ |
Entry points to an unknown inode type.  This should never appear on disk.
| +XFS_DIR3_FT_REG_FILE+ |
Entry points to a file.
| +XFS_DIR3_FT_DIR+ |
Entry points to another directory.
| +XFS_DIR3_FT_CHRDEV+ |
Entry points to a character device.
| +XFS_DIR3_FT_BLKDEV+ |
Entry points to a block device.
| +XFS_DIR3_FT_FIFO+ |
Entry points to a FIFO.
| +XFS_DIR3_FT_SOCK+ |
Entry points to a socket.
| +XFS_DIR3_FT_SYMLINK+ |
Entry points to a symbolic link.
| +XFS_DIR3_FT_WHT+ |
Entry points to an overlayfs whiteout file.  This (as far as the author
knows) has never appeared on disk.
|=====

All non-shortform directories also contain two additional structures: ``leaves''
and ``freespace indexes''.

* Leaves contain the sorted hashed name value (+xfs_da_hashname()+ in
xfs_da_btree.c) and associated ``address'' which points to the effective offset
into the directory's data structures. Leaves are used to optimise lookup
operations.

* Freespace indexes contain free space/empty entry tracking for quickly finding an
appropriately sized location for new entries. They maintain the largest free
space for each ``data'' block.

A few common types are used for the directory structures:

[source, c]
----
typedef __uint16_t xfs_dir2_data_off_t;
typedef __uint32_t xfs_dir2_dataptr_t;
----


[[Shortform_Directories]]
== Short Form Directories

* Directory entries are stored within the inode.

* The only data stored is the name, inode number, and offset.  No ``leaf'' or
``freespace index'' information is required as an inode can only store a few
entries.

* ``.'' is not stored (as it's in the inode itself), and ``..'' is a dedicated
+parent+ field in the header.

* The number of directories that can be stored in an inode depends on the
xref:On-disk_Inode[inode] size, the number of entries, the length of the entry
names, and extended attribute data.

* Once the number of entries exceeds the space available in the inode, the
format is converted to a xref:Block_Directories[block directory].

* Shortform directory data is packed as tightly as possible on the disk with the
remaining space zeroed:

[source, c]
----
typedef struct xfs_dir2_sf {
     xfs_dir2_sf_hdr_t         hdr;
     xfs_dir2_sf_entry_t       list[1];
} xfs_dir2_sf_t;
----

*hdr*::
Short form directory header.

*list*::
An array of variable-length directory entry records.

[source, c]
----
typedef struct xfs_dir2_sf_hdr {
     __uint8_t                 count;
     __uint8_t                 i8count;
     xfs_dir2_inou_t           parent;
} xfs_dir2_sf_hdr_t;
----

*count*::
Number of directory entries.

*i8count*::
Number of directory entries requiring 64-bit entries, if any inode numbers
require 64-bits.  Zero otherwise.

*parent*::
The absolute inode number of this directory's parent.

[source, c]
----
typedef struct xfs_dir2_sf_entry {
     __uint8_t                 namelen;
     xfs_dir2_sf_off_t         offset;
     __uint8_t                 name[1];
     __uint8_t                 ftype;
     xfs_dir2_inou_t           inumber;
} xfs_dir2_sf_entry_t;
----

*namelen*::
Length of the name, in bytes.

*offset*::
Offset tag used to assist with directory iteration.

*name*::
The name of the directory entry.  The entry is not NULL-terminated.

*ftype*::
The type of the inode.  This is used to avoid reading the inode while iterating
a directory.  The +XFS_SB_VERSION2_FTYPE+ feature must be set, or this field
will not be present.

*inumber*::
The inode number that this entry points to.  The length is either 32 or 64
bits, depending on whether +icount+ or +i8count+, respectively, are set in the
header.

.Short form directory layout
image::images/39.png[]

* Inode numbers are stored using 4 or 8 bytes depending on whether all the inode
numbers for the directory fit in 4 bytes (32 bits) or not. If all inode numbers
fit in 4 bytes, the header's +count+ value specifies the number of entries in
the directory and +i8count+ will be zero. If any inode number exceeds 4 bytes,
all inode numbers will be 8 bytes in size and the header's +i8count+ value
specifies the number of entries requiring larger inodes.  +i4count+ is still
the number of entries.  The following union covers the shortform inode number
structure:

[source, c]
----
typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
typedef union {
     xfs_dir2_ino8_t           i8;
     xfs_dir2_ino4_t           i4;
} xfs_dir2_inou_t;
----

=== xfs_db Short Form Directory Example

A directory is created with 4 files, all inode numbers fitting within 4 bytes:

----
xfs_db> inode <inode#>
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 1
core.format = 1 (local)
core.nlinkv1 = 2
...
core.size = 94
core.nblocks = 0
core.extsize = 0
core.nextents = 0
...
u.sfdir2.hdr.count = 4
u.sfdir2.hdr.i8count = 0
u.sfdir2.hdr.parent.i4 = 128              /* parent = root inode */
u.sfdir2.list[0].namelen = 15
u.sfdir2.list[0].offset = 0x30
u.sfdir2.list[0].name = "frame000000.tst"
u.sfdir2.list[0].inumber.i4 = 25165953
u.sfdir2.list[1].namelen = 15
u.sfdir2.list[1].offset = 0x50
u.sfdir2.list[1].name = "frame000001.tst"
u.sfdir2.list[1].inumber.i4 = 25165954
u.sfdir2.list[2].namelen = 15
u.sfdir2.list[2].offset = 0x70
u.sfdir2.list[2].name = "frame000002.tst"
u.sfdir2.list[2].inumber.i4 = 25165955
u.sfdir2.list[3].namelen = 15
u.sfdir2.list[3].offset = 0x90
u.sfdir2.list[3].name = "frame000003.tst"
u.sfdir2.list[3].inumber.i4 = 25165956
----

The raw data on disk with the first entry highlighted. The six byte header
precedes the first entry:

[subs="quotes"]
----
xfs_db> type text
xfs_db> p
00: 49 4e 41 ed 01 01 00 02 00 00 00 00 00 00 00 00 INA.............
10: 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 02 ................
20: 44 ad 3a 83 1d a9 4a d0 44 ad 3a ab 0b c7 a7 d0 D.....J.D.......
30: 44 ad 3a ab 0b c7 a7 d0 00 00 00 00 00 00 00 5e D...............
40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
50: 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 00 ................
60: ff ff ff ff 04 00 00 00 00 80 *0f 00 30 66 72 61* ............0fra
70: *6d 65 30 30 30 30 30 30 2e 74 73 74 01 80 00 81* me000000.tst....
80: 0f 00 50 66 72 61 6d 65 30 30 30 30 30 31 2e 74 ..Pframe000001.t
90: 73 74 01 80 00 82 0f 00 70 66 72 61 6d 65 30 30 st......pframe00
a0: 30 30 30 32 2e 74 73 74 01 80 00 83 0f 00 90 66 0002.tst........
b0: 72 61 6d 65 30 30 30 30 30 33 2e 74 73 74 01 80 rame000003.tst..
cO: 00 84 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
----

Next, an entry is deleted (frame000001.tst), and any entries after the deleted
entry are moved or compacted to ``cover'' the hole:

----
xfs_db> inode <inode#>
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 1
core.format = 1 (local)
core.nlinkv1 = 2
...
core.size = 72
core.nblocks = 0
core.extsize = 0
core.nextents = 0
...
u.sfdir2.hdr.count = 3
u.sfdir2.hdr.i8count = 0
u.sfdir2.hdr.parent.i4 = 128
u.sfdir2.list[0].namelen = 15
u.sfdir2.list[0].offset = 0x30
u.sfdir2.list[0].name = "frame000000.tst"
u.sfdir2.list[0].inumber.i4 = 25165953
u.sfdir2.list[1].namelen = 15
u.sfdir2.list[1].offset = 0x70
u.sfdir2.list[1].name = "frame000002.tst"
u.sfdir2.list[1].inumber.i4 = 25165955
u.sfdir2.list[2].namelen = 15
u.sfdir2.list[2].offset = 0x90
u.sfdir2.list[2].name = "frame000003.tst"
u.sfdir2.list[2].inumber.i4 = 25165956
----

Raw disk data, the space beyond the shortform entries is invalid and could be non-zero:

----
xfs_db> type text
xfs_db> p
00: 49  4e 41 ed 01 01 00 02 00 00 00 00 00 00 00 00 INA.............
10: 00  00 00 02 00 00 00 00 00 00 00 00 00 00 00 03 ................
20: 44  b2 45 a2 09 fd e4 50 44 b2 45 a3 12 ee b5 d0 D.E....PD.E.....
30: 44  b2 45 a3 12 ee b5 d0 00 00 00 00 00 00 00 48 D.E............H
40: 00  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
50: 00  00 00 02 00 00 00 00 00 00 00 00 00 00 00 00 ................
60: ff  ff ff ff 03 00 00 00 00 80 0f 00 30 66 72 61 ............0fra
70: 6d  65 30 30 30 30 30 30 2e 74 73 74 01 80 00 81 me000000.tst....
80: 0f  00 70 66 72 61 6d 65 30 30 30 30 30 32 2e 74 ..pframe000002.t
90: 73  74 01 80 00 83 0f 00 90 66 72 61 6d 65 30 30 st.......frame00
a0: 30  30 30 33 2e 74 73 74 01 80 00 84 0f 00 90 66 0003.tst.......f
b0: 72  61 6d 65 30 30 30 30 30 33 2e 74 73 74 01 80 rame000003.tst..
c0: 00  84 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
----

This is an example of mixed 4-byte and 8-byte inodes in a directory:

----
xfs_db> inode 1024
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 3
core.format = 1 (local)
core.nlinkv2 = 9
...
core.size = 125
core.nblocks = 0
core.extsize = 0
core.nextents = 0
...
u3.sfdir3.hdr.count = 7
u3.sfdir3.hdr.i8count = 4
u3.sfdir3.hdr.parent.i8 = 1024
u3.sfdir3.list[0].namelen = 3
u3.sfdir3.list[0].offset = 0x60
u3.sfdir3.list[0].name = "git"
u3.sfdir3.list[0].inumber.i8 = 1027
u3.sfdir3.list[0].filetype = 2
u3.sfdir3.list[1].namelen = 4
u3.sfdir3.list[1].offset = 0x70
u3.sfdir3.list[1].name = "home"
u3.sfdir3.list[1].inumber.i8 = 13422826546
u3.sfdir3.list[1].filetype = 2
u3.sfdir3.list[2].namelen = 10
u3.sfdir3.list[2].offset = 0x80
u3.sfdir3.list[2].name = "mike"
u3.sfdir3.list[2].inumber.i8 = 4299308032
u3.sfdir3.list[2].filetype = 2
u3.sfdir3.list[3].namelen = 3
u3.sfdir3.list[3].offset = 0x98
u3.sfdir3.list[3].name = "mtr"
u3.sfdir3.list[3].inumber.i8 = 13433252916
u3.sfdir3.list[3].filetype = 2
u3.sfdir3.list[4].namelen = 3
u3.sfdir3.list[4].offset = 0xa8
u3.sfdir3.list[4].name = "vms"
u3.sfdir3.list[4].inumber.i8 = 16647516355
u3.sfdir3.list[4].filetype = 2
u3.sfdir3.list[5].namelen = 5
u3.sfdir3.list[5].offset = 0xb8
u3.sfdir3.list[5].name = "rsync"
u3.sfdir3.list[5].inumber.i8 = 3494912
u3.sfdir3.list[5].filetype = 2
u3.sfdir3.list[6].namelen = 3
u3.sfdir3.list[6].offset = 0xd0
u3.sfdir3.list[6].name = "tmp"
u3.sfdir3.list[6].inumber.i8 = 1593379
u3.sfdir3.list[6].filetype = 2
----

[[Block_Directories]]
== Block Directories

When the shortform directory space exceeds the space in an inode, the directory
data is moved into a new single directory block outside the inode. The inode's
format is changed from ``local'' to ``extent'' Following is a list of points about
block directories.

* All directory data is stored within the one directory block, including ``.'' and
``..'' entries which are mandatory.

* The block also contains ``leaf'' and ``freespace index'' information.

* The location of the block is defined by the inode's in-core
xref:Extent_List[extent list]: the +di_u.u_bmx[0]+ value. The file offset in
the extent must always be zero and the +length+ = (directory block size /
filesystem block size). The block number points to the filesystem block
containing the directory data.

* Block directory data is stored in the following structures:

[source, c]
----
#define XFS_DIR2_DATA_FD_COUNT 3
typedef struct xfs_dir2_block {
     xfs_dir2_data_hdr_t        hdr;
     xfs_dir2_data_union_t      u[1];
     xfs_dir2_leaf_entry_t      leaf[1];
     xfs_dir2_block_tail_t      tail;
} xfs_dir2_block_t;
----

*hdr*::
Directory block header.  On a v5 filesystem this is +xfs_dir3_data_hdr_t+.

*u*::
Union of directory and unused entries.

*leaf*::
Hash values of the entries in this block.

*tail*::
Bookkeeping for the leaf entries.

[source, c]
----
typedef struct xfs_dir2_data_hdr {
     __uint32_t                 magic;
     xfs_dir2_data_free_t       bestfree[XFS_DIR2_DATA_FD_COUNT];
} xfs_dir2_data_hdr_t;
----

*magic*::
Magic number for this directory block.

*bestfree*::
An array pointing to free regions in the directory block.

On a v5 filesystem, directory and attribute blocks are formatted with v3
headers, which contain extra data:

[source, c]
----
struct xfs_dir3_blk_hdr {
     __be32                     magic;
     __be32                     crc;
     __be64                     blkno;
     __be64                     lsn;
     uuid_t                     uuid;
     __be64                     owner;
};
----

*magic*::
Magic number for this directory block.

*crc*::
Checksum of the directory block.

*blkno*::
Block number of this directory block.

*lsn*::
Log sequence number of the last write to this block.

*uuid*::
The UUID of this block, which must match either +sb_uuid+ or +sb_meta_uuid+
depending on which features are set.

*owner*::
The inode number that this directory block belongs to.

[source, c]
----
struct xfs_dir3_data_hdr {
     struct xfs_dir3_blk_hdr    hdr;
     xfs_dir2_data_free_t       best_free[XFS_DIR2_DATA_FD_COUNT];
     __be32                     pad;
};
----

*hdr*::
The v5 directory/attribute block header.

*best_free*::
An array pointing to free regions in the directory block.

*pad*::
Padding to maintain a 64-bit alignment.

Within the block, data structures are as follows:

[source, c]
-----
typedef struct xfs_dir2_data_free {
     xfs_dir2_data_off_t        offset;
     xfs_dir2_data_off_t        length;
} xfs_dir2_data_free_t;
----

*offset*::
Block offset of a free block, in bytes.

*length*::
Length of the free block, in bytes.

Space inside the directory block can be used for directory entries or unused
entries.  This is signified via a union of the two types:

[source, c]
-----
typedef union {
     xfs_dir2_data_entry_t      entry;
     xfs_dir2_data_unused_t     unused;
} xfs_dir2_data_union_t;
----

*entry*::
A directory entry.

*unused*::
An unused entry.

[source, c]
-----
typedef struct xfs_dir2_data_entry {
     xfs_ino_t                  inumber;
     __uint8_t                  namelen;
     __uint8_t                  name[1];
     __uint8_t                  ftype;
     xfs_dir2_data_off_t        tag;
} xfs_dir2_data_entry_t;
----

*inumber*::
The inode number that this entry points to.

*namelen*::
Length of the name, in bytes.

*name*::
The name associated with this entry.

*ftype*::
The type of the inode.  This is used to avoid reading the inode while iterating
a directory.  The +XFS_SB_VERSION2_FTYPE+ feature must be set, or this field
will not be present.

*tag*::
Starting offset of the entry, in bytes.  This is used for directory iteration.

[source, c]
-----
typedef struct xfs_dir2_data_unused {
     __uint16_t                 freetag;  /* 0xffff */
     xfs_dir2_data_off_t        length;
     xfs_dir2_data_off_t        tag;
} xfs_dir2_data_unused_t;
----

*freetag*::
Magic number signifying that this is an unused entry.  Must be 0xFFFF.

*length*::
Length of this unused entry, in bytes.

*tag*::
Starting offset of the entry, in bytes.

[source, c]
-----
typedef struct xfs_dir2_leaf_entry {
     xfs_dahash_t               hashval;
     xfs_dir2_dataptr_t         address;
} xfs_dir2_leaf_entry_t;
----

*hashval*::
Hash value of the name of the directory entry.  This is used to speed up entry
lookups.

*address*::
Block offset of the entry, in eight byte units.

[source, c]
-----
typedef struct xfs_dir2_block_tail {
     __uint32_t                 count;
     __uint32_t                 stale;
} xfs_dir2_block_tail_t;
----

*count*::
Number of leaf entries.

*stale*::
Number of free leaf entries.

Following is a diagram of how these pieces fit together for a block directory.

.Block directory layout
image::images/43.png[]

* The magic number in the header is ``XD2B'' (0x58443242), or ``XDB3'' (0x58444233)
on a v5 filesystem.

* The +tag+ in the +xfs_dir2_data_entry_t+ structure stores its offset from the
start of the block.

* The start of a free space region is marked with the +xfs_dir2_data_unused_t+
structure where the +freetag+ is +0xffff+. The +freetag+ and +length+ overwrites
the +inumber+ for an entry. The +tag+ is located at +length - sizeof(tag)+ from
the start of the +unused+ entry on-disk.

* The +bestfree+ array in the header points to as many as three of the largest
spaces of free space within the block for storing new entries sorted by largest
to third largest. If there are less than 3 empty regions, the remaining
+bestfree+ elements are zeroed. The +offset+ specifies the offset from the start
of the block in bytes, and the +length+ specifies the size of the free space in
bytes. The location each points to must contain the above
+xfs_dir2_data_unused_t+ structure. As a block cannot exceed 64KB in size, each
is a 16-bit value. +bestfree+ is used to optimise the time required to locate
space to create an entry. It saves scanning through the block to find a location
suitable for every entry created.

* The +tail+ structure specifies the number of elements in the +leaf+ array and
the number of +stale+ entries in the array. The +tail+ is always located at the
end of the block. The +leaf+ data immediately precedes the +tail+ structure.

* The +leaf+ array, which grows from the end of the block just before the +tail+
structure, contains an array of hash/address pairs for quickly looking up a name
by a hash value. Hash values are covered by the introduction to directories. The
+address+ on-disk is the offset into the block divided by 8
(+XFS_DIR2_DATA_ALIGN+). Hash/address pairs are stored on disk to optimise
lookup speed for large directories. If they were not stored, the hashes would
have to be calculated for all entries each time a lookup occurs in a directory.


=== xfs_db Block Directory Example

A directory is created with 8 entries, directory block size = filesystem block size:

----
xfs_db> sb 0
xfs_db> p
magicnum = 0x58465342
blocksize = 4096
...
dirblklog = 0
...
xfs_db> inode <inode#>
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 1
core.format = 2 (extents)
core.nlinkv1 = 2
...
core.size = 4096
core.nblocks = 1
core.extsize = 0
core.nextents = 1
...
u.bmx[0] = [startoff,startblock,blockcount,extentflag] 0:[0,2097164,1,0]
----

Go to the ``startblock'' and show the raw disk data:

----
xfs_db> dblock 0
xfs_db> type text
xfs_db> p
000: 58 44 32 42  01 30 0e 78 00 00 00 00 00 00 00 00 XD2B.0.x........
010: 00 00 00 00  02 00 00 80 01 2e 00 00 00 00 00 10 ................
020: 00 00 00 00  00 00 00 80 02 2e 2e 00 00 00 00 20 ................
030: 00 00 00 00  02 00 00 81 0f 66 72 61 6d 65 30 30 .........frame00
040: 30 30 30 30  2e 74 73 74 80 8e 59 00 00 00 00 30 0000.tst..Y....0
050: 00 00 00 00  02 00 00 82 0f 66 72 61 6d 65 30 30 .........frame00
060: 30 30 30 31  2e 74 73 74 d0 ca 5c 00 00 00 00 50 0001.tst.......P
070: 00 00 00 00  02 00 00 83 0f 66 72 61 6d 65 30 30 .........frame00
080: 30 30 30 32  2e 74 73 74 00 00 00 00 00 00 00 70 0002.tst.......p
090: 00 00 00 00  02 00 00 84 0f 66 72 61 6d 65 30 30 .........frame00
0a0: 30 30 30 33  2e 74 73 74 00 00 00 00 00 00 00 90 0003.tst........
0b0: 00 00 00 00  02 00 00 85 0f 66 72 61 6d 65 30 30 .........frame00
0c0: 30 30 30 34 2e 74 73 74 00 00 00 00 00 00 00 b0 0004.tst........
0d0: 00 00 00 00 02 00 00 86 0f 66 72 61 6d 65 30 30 .........frame00
0e0: 30 30 30 35 2e 74 73 74 00 00 00 00 00 00 00 d0 0005.tst........
0f0: 00 00 00 00 02 00 00 87 0f 66 72 61 6d 65 30 30 .........frame00
100: 30 30 30 36 2e 74 73 74 00 00 00 00 00 00 00 f0 0006.tst........
110: 00 00 00 00 02 00 00 88 0f 66 72 61 6d 65 30 30 .........frame00
120: 30 30 30 37 2e 74 73 74 00 00 00 00 00 00 01 10 0007.tst........
130: ff ff 0e 78 00 00 00 00 00 00 00 00 00 00 00 00 ...x............
----

The ``leaf'' and ``tail'' structures are stored at the end of the block, so as the
directory grows, the middle is filled in:

----
fa0: 00 00 00 00 00 00 01 30 00 00 00 2e 00 00 00 02 .......0........
fb0: 00 00 17 2e 00 00 00 04 83 a0 40 b4 00 00 00 0e ................
fc0: 93 a0 40 b4 00 00 00 12 a3 a0 40 b4 00 00 00 06 ................
fd0: b3 a0 40 b4 00 00 00 0a c3 a0 40 b4 00 00 00 1e ................
fe0: d3 a0 40 b4 00 00 00 22 e3 a0 40 b4 00 00 00 16 ................
ff0: f3 a0 40 b4 00 00 00 1a 00 00 00 0a 00 00 00 00 ................
----

In a readable format:

----
xfs_db> type dir2
xfs_db> p
bhdr.magic = 0x58443242
bhdr.bestfree[0].offset = 0x130
bhdr.bestfree[0].length = 0xe78
bhdr.bestfree[1].offset = 0
bhdr.bestfree[1].length = 0
bhdr.bestfree[2].offset = 0
bhdr.bestfree[2].length = 0
bu[0].inumber = 33554560
bu[0].namelen = 1
bu[0].name = "."
bu[0].tag = 0x10
bu[1].inumber = 128
bu[1].namelen = 2
bu[1].name = ".."
bu[1].tag = 0x20
bu[2].inumber = 33554561
bu[2].namelen = 15
bu[2].name = "frame000000.tst"
bu[2].tag = 0x30
bu[3].inumber = 33554562
bu[3].namelen = 15
bu[3].name = "frame000001.tst"
bu[3].tag = 0x50
...
bu[8].inumber = 33554567
bu[8].namelen = 15
bu[8].name = "frame000006.tst"
bu[8].tag = 0xf0
bu[9].inumber = 33554568
bu[9].namelen = 15
bu[9].name = "frame000007.tst"
bu[9].tag = 0x110
bu[10].freetag = 0xffff
bu[10].length = 0xe78
bu[10].tag = 0x130
bleaf[0].hashval = 0x2e
bleaf[0].address = 0x2
bleaf[1].hashval = 0x172e
bleaf[1].address = 0x4
bleaf[2].hashval = 0x83a040b4
bleaf[2].address = 0xe
...
bleaf[8].hashval = 0xe3a040b4
bleaf[8].address = 0x16
bleaf[9].hashval = 0xf3a040b4
bleaf[9].address = 0x1a
btail.count = 10
btail.stale = 0
----

[NOTE]
For block directories, all xfs_db fields are preceded with ``b''.

For a simple lookup example, the hash of frame000000.tst is 0xb3a040b4. Looking
up that value, we get an address of 0x6. Multiply that by 8, it becomes offset
0x30 and the inode at that point is 33554561. 

When we remove an entry from the middle (frame000004.tst), we can see how the
freespace details are adjusted:

----
bhdr.magic = 0x58443242
bhdr.bestfree[0].offset = 0x130
bhdr.bestfree[0].length = 0xe78
bhdr.bestfree[1].offset = 0xb0
bhdr.bestfree[1].length = 0x20
bhdr.bestfree[2].offset = 0
bhdr.bestfree[2].length = 0
...
bu[5].inumber = 33554564
bu[5].namelen = 15
bu[5].name = "frame000003.tst"
bu[5].tag = 0x90
bu[6].freetag = 0xffff
bu[6].length = 0x20
bu[6].tag = 0xb0
bu[7].inumber = 33554566
bu[7].namelen = 15
bu[7].name = "frame000005.tst"
bu[7].tag = 0xd0
...
bleaf[7].hashval = 0xd3a040b4
bleaf[7].address = 0x22
bleaf[8].hashval = 0xe3a040b4
bleaf[8].address = 0
bleaf[9].hashval = 0xf3a040b4
bleaf[9].address = 0x1a
btail.count = 10
btail.stale = 1
----

A new ``bestfree'' value is added for the entry, the start of the entry is marked
as unused with 0xffff (which overwrites the inode number for an actual entry),
and the length of the space. The tag remains intact at the +offset+length -
sizeof(tag)+. The address for the hash is also cleared. The affected areas are
highlighted below:

[subs="quotes"]
----
090: 00 00 00 00 02 00 00 84 0f 66 72 61 6d 65 30 30 ..........frame00
0a0: 30 30 30 33 2e 74 73 74 00 00 00 00 00 00 00 90 0003.tst.........
0b0: *ff ff 00 20* 02 00 00 85 0f 66 72 61 6d 65 30 30 ..........frame00
0c0: 30 30 30 34 2e 74 73 74 00 00 00 00 *00 00 00 b0* 0004.tst.........
0d0: 00 00 00 00 02 00 00 86 0f 66 72 61 6d 65 30 30 ..........frame00
0e0: 30 30 30 35 2e 74 73 74 00 00 00 00 00 00 00 0d 0005.tst.........
...
fb0: 00 00 17 2e 00 00 00 04 83 a0 40 b4 00 00 00 0e .................
fc0: 93 a0 40 b4 00 00 00 12 a3 a0 40 b4 00 00 00 06 .................
fd0: b3 a0 40 b4 00 00 00 0a c3 a0 40 b4 00 00 00 1e .................
fe0: d3 a0 40 b4 00 00 00 22 e3 a0 40 b4 *00 00 00 00* .................
ff0: f3 a0 40 b4 00 00 00 1a 00 00 00 0a *00 00 00 01* .................
----



[[Leaf_Directories]]
== Leaf Directories

Once a Block Directory has filled the block, the directory data is changed into
a new format. It still uses xref:Data_Extents[extents] and the same basic
structures, but the ``data'' and ``leaf'' are split up into their own extents. The
``leaf'' information only occupies one extent. As ``leaf'' information is more
compact than ``data'' information, more than one ``data'' extent is common.

* Block to Leaf conversions retain the existing block for the data entries and
allocate a new block for the leaf and freespace index information.

* As with all directories, data blocks must start at logical offset zero.

* The ``leaf'' block has a special offset defined by +XFS_DIR2_LEAF_OFFSET+.
Currently, this is 32GB and in the extent view, a block offset of
32GB / +sb_blocksize+. On a 4KB block filesystem, this is 0x800000 (8388608
decimal).

* Blocks with directory entries (``data'' extents) have the magic number ``X2D2''
(0x58443244), or ``XDD3'' (0x58444433) on a v5 filesystem.

* The ``data'' extents have a new header (no ``leaf'' data):

[source, c]
----
typedef struct xfs_dir2_data {
     xfs_dir2_data_hdr_t       hdr;
     xfs_dir2_data_union_t     u[1];
} xfs_dir2_data_t;
----

*hdr*::
Data block header.  On a v5 filesystem, this field is +struct xfs_dir3_data_hdr+.

*u*::
Union of directory and unused entries, exactly the same as in a block directory.

// split lists

* The ``leaf'' extent uses the following structures:

[source, c]
----
typedef struct xfs_dir2_leaf {
     xfs_dir2_leaf_hdr_t       hdr;
     xfs_dir2_leaf_entry_t     ents[1];
     xfs_dir2_data_off_t       bests[1];
     xfs_dir2_leaf_tail_t      tail;
} xfs_dir2_leaf_t;
----

*hdr*::
Directory leaf header.  On a v5 filesystem this is +struct
xfs_dir3_leaf_hdr_t+.

*ents*::
Hash values of the entries in this block.

*bests*::
An array pointing to free regions in the directory block.

*tail*::
Bookkeeping for the leaf entries.

[source, c]
----
typedef struct xfs_dir2_leaf_hdr {
     xfs_da_blkinfo_t          info;
     __uint16_t                count;
     __uint16_t                stale;
} xfs_dir2_leaf_hdr_t;
----

*info*::
Leaf btree block header.

*count*::
Number of leaf entries.

*stale*::
Number of stale/zeroed leaf entries.

[source, c]
----
struct xfs_dir3_leaf_hdr {
     struct xfs_da3_blkinfo    info;
     __uint16_t                count;
     __uint16_t                stale;
     __be32                    pad;
};
----

*info*::
Leaf B+tree block header.

*count*::
Number of leaf entries.

*stale*::
Number of stale/zeroed leaf entries.

*pad*::
Padding to maintain alignment rules.

[source, c]
----
typedef struct xfs_dir2_leaf_tail {
     __uint32_t                bestcount;
} xfs_dir2_leaf_tail_t;
----

*bestcount*::
Number of best free entries.

// split lists

* The magic number of the leaf block is +XFS_DIR2_LEAF1_MAGIC+ (0xd2f1); on a
v5 filesystem it is +XFS_DIR3_LEAF1_MAGIC+ (0x3df1).

* The size of the +ents+ array is specified by +hdr.count+.

* The size of the +bests+ array is specified by the +tail.bestcount+, which is
also the number of ``data'' blocks for  the directory. The bests array maintains
each data block's +bestfree[0].length+ value.

.Leaf directory free entry detail
image::images/48.png[]

=== xfs_db Leaf Directory Example

For this example, a directory was created with 256 entries (frame000000.tst to
frame000255.tst).  Some files were deleted (frame00005*, frame00018* and
frame000240.tst) to show free list characteristics.

----
xfs_db> inode <inode#>
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 1
core.format = 2 (extents)
core.nlinkv1 = 2
...
core.size = 12288
core.nblocks = 4
core.extsize = 0
core.nextents = 3
...
u.bmx[0-2] = [startoff,startblock,blockcount,extentflag]
          0:[0,4718604,1,0]
          1:[1,4718610,2,0]
          2:[8388608,4718605,1,0]
----

As can be seen in this example, three blocks are used for ``data'' in two extents,
and the ``leaf'' extent has a logical offset of 8388608 blocks (32GB).

Examining the first block:

----
xfs_db> dblock 0
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0x670
dhdr.bestfree[0].length = 0x140
dhdr.bestfree[1].offset = 0xff0
dhdr.bestfree[1].length = 0x10
dhdr.bestfree[2].offset = 0
dhdr.bestfree[2].length = 0
du[0].inumber = 75497600
du[0].namelen = 1
du[0].name = "."
du[0].tag = 0x10
du[1].inumber = 128
du[1].namelen = 2
du[1].name = ".."
du[1].tag = 0x20
du[2].inumber = 75497601
du[2].namelen = 15
du[2].name = "frame000000.tst"
du[2].tag = 0x30
du[3].inumber = 75497602
du[3].namelen = 15
du[3].name = "frame000001.tst"
du[3].tag = 0x50
...
du[51].inumber = 75497650
du[51].namelen = 15
du[51].name = "frame000049.tst"
du[51].tag = 0x650
du[52].freetag = 0xffff
du[52].length = 0x140
du[52].tag = 0x670
du[53].inumber = 75497661
du[53].namelen = 15
du[53].name = "frame000060.tst"
du[53].tag = 0x7b0
...
du[118].inumber = 75497758
du[118].namelen = 15
du[118].name = "frame000125.tst"
du[118].tag = 0xfd0
du[119].freetag = 0xffff
du[119].length = 0x10
du[119].tag = 0xff0
----

[NOTE]
The xfs_db field output is preceded by a ``d'' for ``data''.

The next ``data'' block:

----
xfs_db> dblock 1
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0x6d0
dhdr.bestfree[0].length = 0x140
dhdr.bestfree[1].offset = 0xe50
dhdr.bestfree[1].length = 0x20
dhdr.bestfree[2].offset = 0xff0
dhdr.bestfree[2].length = 0x10
du[0].inumber = 75497759
du[0].namelen = 15
du[0].name = "frame000126.tst"
du[0].tag = 0x10
...
du[53].inumber = 75497844
du[53].namelen = 15
du[53].name = "frame000179.tst"
du[53].tag = 0x6b0
du[54].freetag = 0xffff
du[54].length = 0x140
du[54].tag = 0x6d0
du[55].inumber = 75497855
du[55].namelen = 15
du[55].name = "frame000190.tst"
du[55].tag = 0x810
...
du[104].inumber = 75497904
du[104].namelen = 15
du[104].name = "frame000239.tst"
du[104].tag = 0xe30
du[105].freetag = 0xffff
du[105].length = 0x20
du[105].tag = 0xe50
du[106].inumber = 75497906
du[106].namelen = 15
du[106].name = "frame000241.tst"
du[106].tag = 0xe70
...
du[117].inumber = 75497917
du[117].namelen = 15
du[117].name = "frame000252.tst"
du[117].tag = 0xfd0
du[118].freetag = 0xffff
du[118].length = 0x10
du[118].tag = 0xff0
----

And the last data block:


----
xfs_db> dblock 2
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0x70
dhdr.bestfree[0].length = 0xf90
dhdr.bestfree[1].offset = 0
dhdr.bestfree[1].length = 0
dhdr.bestfree[2].offset = 0
dhdr.bestfree[2].length = 0
du[0].inumber = 75497918
du[0].namelen = 15
du[0].name = "frame000253.tst"
du[0].tag = 0x10
du[1].inumber = 75497919
du[1].namelen = 15
du[1].name = "frame000254.tst"
du[1].tag = 0x30
du[2].inumber = 75497920
du[2].namelen = 15
du[2].name = "frame000255.tst"
du[2].tag = 0x50
du[3].freetag = 0xffff
du[3].length = 0xf90
du[3].tag = 0x70
----

Examining the ``leaf'' block (with the fields preceded by an ``l'' for ``leaf''):

----
xfs_db> dblock 8388608
xfs_db> type dir2
xfs_db> p
lhdr.info.forw = 0
lhdr.info.back = 0
lhdr.info.magic = 0xd2f1
lhdr.count = 258
lhdr.stale = 0
lbests[0-2] = 0:0x10 1:0x10 2:0xf90
lents[0].hashval = 0x2e
lents[0].address = 0x2
lents[1].hashval = 0x172e
lents[1].address = 0x4
lents[2].hashval = 0x23a04084
lents[2].address = 0x116
...
lents[257].hashval = 0xf3a048bc
lents[257].address = 0x366
ltail.bestcount = 3
----

Note how the +lbests+ array correspond with the +bestfree[0].length+ values in
the ``data'' blocks:

----
xfs_db> dblock 0
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0xff0
dhdr.bestfree[0].length = 0x10
...
xfs_db> dblock 1
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0xff0
dhdr.bestfree[0].length = 0x10
...
xfs_db> dblock 2
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0x70
dhdr.bestfree[0].length = 0xf90
----

Now after the entries have been deleted:

----
xfs_db> dblock 8388608
xfs_db> type dir2
xfs_db> p
lhdr.info.forw = 0
lhdr.info.back = 0
lhdr.info.magic = 0xd2f1
lhdr.count = 258
lhdr.stale = 21
lbests[0-2] = 0:0x140 1:0x140 2:0xf90
lents[0].hashval = 0x2e
lents[0].address = 0x2
lents[1].hashval = 0x172e
lents[1].address = 0x4
lents[2].hashval = 0x23a04084
lents[2].address = 0x116
...
----

As can be seen, the +lbests+ values have been update to contain each
+hdr.bestfree[0].length+ values. The leaf's +hdr.stale+ value has also been
updated to specify the number of stale entries in the array. The stale entries
have an address of zero.

TODO: Need an example for where new entries get inserted with several large free
spaces.


[[Node_Directories]]
== Node Directories

When the ``leaf'' information fills a block, the extents undergo another
separation. All ``freeindex'' information moves into its own extent. Like Leaf
Directories, the ``leaf'' block maintained the best free space information for
each ``data'' block. This is not possible with more than one leaf.

* The ``data'' blocks stay the same as leaf directories.

* After the ``freeindex'' data moves to its own block, it is possible for the
leaf data to fit within a single leaf block.  This single leaf block has a
magic number of +XFS_DIR2_LEAFN_MAGIC+ (0xd2ff) or on a v5 filesystem,
+XFS_DIR3_LEAFN_MAGIC+ (0x3dff).

* The ``leaf'' blocks eventually change into a B+tree with the generic B+tree
header pointing to directory ``leaves'' as described in
xref:Leaf_Directories[Leaf Directories]. Blocks with leaf data still have the
+LEAFN_MAGIC+ magic number as outlined above.  The top-level tree blocks are
called ``nodes'' and have a magic number of +XFS_DA_NODE_MAGIC+ (0xfebe), or on
a v5 filesystem, +XFS_DA3_NODE_MAGIC+ (0x3ebe).

* Distinguishing between a combined leaf/freeindex block (+LEAF1_MAGIC+), a
leaf-only block (+LEAFN_MAGIC+), and a btree node block (+NODE_MAGIC+) can only
be done by examining the magic number.

* The new ``freeindex'' block(s) only contains the bests for each data block.

* The freeindex block uses the following structures:

[source, c]
----
typedef struct xfs_dir2_free_hdr {
     __uint32_t                magic;
     __int32_t                 firstdb;
     __int32_t                 nvalid;
     __int32_t                 nused;
} xfs_dir2_free_hdr_t;
----

*magic*::
The magic number of the free block, ``XD2F'' (0x0x58443246).

*firstdb*::
The starting directory block number for the bests array.

*nvalid*::
Number of elements in the bests array.

*nused*::
Number of valid elements in the bests array.

[source, c]
----
typedef struct xfs_dir2_free {
     xfs_dir2_free_hdr_t       hdr;
     xfs_dir2_data_off_t       bests[1];
} xfs_dir2_free_t;
----

*hdr*::
Free block header.

*bests*::
An array specifying the best free counts in each directory data block.

// split lists

* On a v5 filesystem, the freeindex block uses the following structures:

[source, c]
----
struct xfs_dir3_free_hdr {
     struct xfs_dir3_blk_hdr   hdr;
     __int32_t                 firstdb;
     __int32_t                 nvalid;
     __int32_t                 nused;
     __int32_t                 pad;
};
----

*hdr*::
v3 directory block header.  The magic number is "XDF3" (0x0x58444633).

*firstdb*::
The starting directory block number for the bests array.

*nvalid*::
Number of elements in the bests array.

*nused*::
Number of valid elements in the bests array.

*pad*::
Padding to maintain alignment.

[source, c]
----
struct xfs_dir3_free {
     xfs_dir3_free_hdr_t       hdr;
     __be16                    bests[1];
};
----

*hdr*::
Free block header.

*bests*::
An array specifying the best free counts in each directory data block.

// split lists

* The location of the leaf blocks can be in any order, the only way to determine
the appropriate is by the node block hash/before values. Given a hash to look up,
you read the node's +btree+ array and first +hashval+ in the array that exceeds
the given hash and it can then be found in the block pointed to by the +before+
value. 

// split lists

* The freeindex's +bests+ array starts from the end of the block and grows to the
start of the block.

* When an data block becomes unused (ie. all entries in it have been deleted), the
block is freed, the data extents contain a hole, and the freeindex's +hdr.nused+
value is decremented and the associated +bests[]+ entry is set to 0xffff. 

* As the first data block always contains ``.'' and ``..'', it's invalid for the
directory to have a hole at the start.

* The freeindex's +hdr.nvalid+ should always be the same as the number of
allocated data directory blocks containing name/inode data and will always be
less than or equal to +hdr.nused+.  The value of +hdr.nused+ should be the same
as the index of the last data directory block plus one (i.e. when the last data
block is freed, +nused+ and +nvalid+ are decremented).

.Node directory layout
image::images/54.png[]

=== xfs_db Node Directory Example

With the node directory examples, we are using a filesystems with 4KB block
size, and a 16KB directory size. The directory has over 2000 entries:

----
xfs_db> sb 0
xfs_db> p
magicnum = 0x58465342
blocksize = 4096
...
dirblklog = 2
...
xfs_db> inode <inode#>
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 1
core.format = 2 (extents)
...
core.size = 81920
core.nblocks = 36
core.extsize = 0
core.nextents = 8
...
u.bmx[0-7] = [startoff,startblock,blockcount,extentflag] 0:[0,7368,4,0]
1:[4,7408,4,0] 2:[8,7444,4,0] 3:[12,7480,4,0] 4:[16,7520,4,0]
5:[8388608,7396,4,0] 6:[8388612,7524,8,0] 7:[16777216,7516,4,0]
----

As can already be observed, all extents are allocated is multiples of 4 blocks.

Blocks 0 to 19 (16+4-1) are used for directory data blocks. Looking at blocks
16-19, we can seen that it's the same as the single-leaf format, except the
+length+ values are a lot larger to accommodate the increased directory block
size:

----
xfs_db> dblock 16
xfs_db> type dir2
xfs_db> p
dhdr.magic = 0x58443244
dhdr.bestfree[0].offset = 0xb0
dhdr.bestfree[0].length = 0x3f50
dhdr.bestfree[1].offset = 0
dhdr.bestfree[1].length = 0
dhdr.bestfree[2].offset = 0
dhdr.bestfree[2].length = 0
du[0].inumber = 120224
du[0].namelen = 15
du[0].name = "frame002043.tst"
du[0].tag = 0x10
du[1].inumber = 120225
du[1].namelen = 15
du[1].name = "frame002044.tst"
du[1].tag = 0x30
du[2].inumber = 120226
du[2].namelen = 15
du[2].name = "frame002045.tst"
du[2].tag = 0x50
du[3].inumber = 120227
du[3].namelen = 15
du[3].name = "frame002046.tst"
du[3].tag = 0x70
du[4].inumber = 120228
du[4].namelen = 15
du[4].name = "frame002047.tst"
du[4].tag = 0x90
du[5].freetag = 0xffff
du[5].length = 0x3f50
du[5].tag = 0
----

Next, the ``node'' block, the fields are preceded with 'n' for node blocks:

----
xfs_db> dblock 8388608
xfs_db> type dir2
xfs_db> p
nhdr.info.forw = 0
nhdr.info.back = 0
nhdr.info.magic = 0xfebe
nhdr.count = 2
nhdr.level = 1
nbtree[0-1] = [hashval,before] 0:[0xa3a440ac,8388616] 1:[0xf3a440bc,8388612]
----

The two following leaf blocks were allocated as part of the directory's
conversion to node format.  All hashes less than 0xa3a440ac are located at
directory offset 8,388,616, and hashes less than 0xf3a440bc are located at
directory offset 8,388,612.  Hashes greater or equal to 0xf3a440bc don't exist
in this directory.

----
xfs_db> dblock 8388616
xfs_db> type dir2
xfs_db> p
lhdr.info.forw = 8388612
lhdr.info.back = 0
lhdr.info.magic = 0xd2ff
lhdr.count = 1023
lhdr.stale = 0
lents[0].hashval = 0x2e
lents[0].address = 0x2
lents[1].hashval = 0x172e
lents[1].address = 0x4
lents[2].hashval = 0x23a04084
lents[2].address = 0x116
...
lents[1021].hashval = 0xa3a440a4
lents[1021].address = 0x1fa2
lents[1022].hashval = 0xa3a440ac
lents[1022].address = 0x1fca
xfs_db> dblock 8388612
xfs_db> type dir2
xfs_db> p
lhdr.info.forw = 0
lhdr.info.back = 8388616
lhdr.info.magic = 0xd2ff
lhdr.count = 1027
lhdr.stale = 0
lents[0].hashval = 0xa3a440b4
lents[0].address = 0x1f52
lents[1].hashval = 0xa3a440bc
lents[1].address = 0x1f7a
...
lents[1025].hashval = 0xf3a440b4
lents[1025].address = 0x1f66
lents[1026].hashval = 0xf3a440bc
lents[1026].address = 0x1f8e
----

An example lookup using xfs_db:

----
xfs_db> hash frame001845.tst
0xf3a26094
----

Doing a binary search through the array, we get address 0x1ce6, which is offset
0xe730. Each fsblock is 4KB in size (0x1000), so it will be offset 0x730 into
directory offset 14. From the extent map, this will be fsblock 7482:

----
xfs_db> fsblock 7482
xfs_db> type text
xfs_db> p
...
730: 00 00 00 00 00 01 d4 da 0f 66 72 61 6d 65 30 30 .........frame00
740: 31 38 34 35 2e 74 73 74 00 00 00 00 00 00 27 30 1845.tst.......0
----

Looking at the freeindex information (fields with an 'f' tag):

----
xfs_db> fsblock 7516
xfs_db> type dir2
xfs_db> p
fhdr.magic = 0x58443246
fhdr.firstdb = 0
fhdr.nvalid = 5
fhdr.nused = 5
fbests[0-4] = 0:0x10 1:0x10 2:0x10 3:0x10 4:0x3f50
----

Like the Leaf Directory, each of the +fbests+ values correspond to each data
block's +bestfree[0].length+ value. 

The +fbests+ array is highlighted in a raw block dump:

[subs="quotes"]
----
xfs_db> type text
xfs_db> p
000: 58 44 32 46 00 00 00 00 00 00 00 05 00 00 00 05 XD2F............
010: *00 10 00 10 00 10 00 10 3f 50* 00 00 1f 01 ff ff .........P......
----

TODO: Example with a hole in the middle


[[Btree_Directories]]
== B+tree Directories

When the extent map in an inode grows beyond the inode's space, the inode format
is changed to a ``btree''. The inode contains a filesystem block point to the
B+tree extent map for the directory's blocks. The B+tree extents contain the
extent map for the ``data'', ``node'', ``leaf'', and ``freeindex'' information as
described in Node Directories.

Refer to the previous section on B+tree xref:Btree_Extent_List[Data Extents] for
more information on XFS B+tree extents.

The following properties apply to both node and B+tree directories:

* The node/leaf trees can be more than one level deep.

* More than one freeindex block may exist, but this will be quite rare. It would
required hundreds of thousand files with quite long file names (or millions with
shorter names) to get a second freeindex block.

=== xfs_db B+tree Directory Example

A directory has been created with 200,000 entries with each entry being 100
characters long. The filesystem block size and directory block size are 4KB:

----
xfs_db> inode <inode#>
xfs_db> p
core.magic = 0x494e
core.mode = 040755
core.version = 1
core.format = 3 (btree)
...
core.size = 22757376
core.nblocks = 6145
core.extsize = 0
core.nextents = 234
core.naextents = 0
core.forkoff = 0
...
u.bmbt.level = 1
u.bmbt.numrecs = 1
u.bmbt.keys[1] = [startoff] 1:[0]
u.bmbt.ptrs[1] = 1:89
xfs_db> fsblock 89
xfs_db> type bmapbtd
xfs_db> p
magic = 0x424d4150
level = 0
numrecs = 234
leftsib = null
rightsib = null
recs[1-234] = [startoff,startblock,blockcount,extentflag]
   1:[0,53,1,0] 2:[1,55,13,0] 3:[14,69,1,0] 4:[15,72,13,0]
   5:[28,86,2,0] 6:[30,90,21,0] 7:[51,112,1,0] 8:[52,114,11,0]
   ...
   125:[5177,902,15,0] 126:[5192,918,6,0] 127:[5198,524786,358,0]
   128:[8388608,54,1,0] 129:[8388609,70,2,0] 130:[8388611,85,1,0]
   ...
   229:[8389164,917,1,0] 230:[8389165,924,19,0] 231:[8389184,944,9,0]
   232:[16777216,68,1,0] 233:[16777217,7340114,1,0] 234:[16777218,5767362,1,0]
----

We have 128 extents and a total of 5555 blocks being used to store name/inode
pairs. With only about 2000 values that can be stored in the freeindex block, 3
blocks have been allocated for this information. The +firstdb+ field specifies
the starting directory block number for each array:

----
xfs_db> dblock 16777216
xfs_db> type dir2
xfs_db> p
fhdr.magic = 0x58443246
fhdr.firstdb = 0
fhdr.nvalid = 2040
fhdr.nused = 2040
fbests[0-2039] = ...
xfs_db> dblock 16777217
xfs_db> type dir2
xfs_db> p
fhdr.magic = 0x58443246
fhdr.firstdb = 2040
fhdr.nvalid = 2040
fhdr.nused = 2040
fbests[0-2039] = ...
xfs_db> dblock 16777218
xfs_db> type dir2
xfs_db> p
fhdr.magic = 0x58443246
fhdr.firstdb = 4080
fhdr.nvalid = 1476
fhdr.nused = 1476
fbests[0-1475] = ...
----

Looking at the root node in the node block, it's a pretty deep tree:

----
xfs_db> dblock 8388608
xfs_db> type dir2
xfs_db> p
nhdr.info.forw = 0
nhdr.info.back = 0
nhdr.info.magic = 0xfebe
nhdr.count = 2
nhdr.level = 2
nbtree[0-1] = [hashval,before] 0:[0x6bbf6f39,8389121] 1:[0xfbbf7f79,8389120]
xfs_db> dblock 8389121
xfs_db> type dir2
xfs_db> p
nhdr.info.forw = 8389120
nhdr.info.back = 0
nhdr.info.magic = 0xfebe
nhdr.count = 263
nhdr.level = 1
nbtree[0-262] = ... 262:[0x6bbf6f39,8388928]
xfs_db> dblock 8389120
xfs_db> type dir2
xfs_db> p
nhdr.info.forw = 0
nhdr.info.back = 8389121
nhdr.info.magic = 0xfebe
nhdr.count = 319
nhdr.level = 1
nbtree[0-318] = [hashval,before] 0:[0x70b14711,8388919] ...
----

The leaves at each the end of a node always point to the end leaves in adjacent
nodes. Directory block 8388928 has a forward pointer to block 8388919 and block
8388919 has a previous pointer to block 8388928, as highlighted in the
following example:

[subs="quotes"]
----
xfs_db> dblock 8388928
xfs_db> type dir2
xfs_db> p
lhdr.info.forw = *8388919*
lhdr.info.back = 8388937
lhdr.info.magic = 0xd2ff
...

xfs_db> dblock 8388919
xfs_db> type dir2
xfs_db> p
lhdr.info.forw = 8388706
lhdr.info.back = *8388928*
lhdr.info.magic = 0xd2ff
...
----