€•_µŒsphinx.addnodes”Œdocument”“”)”}”(Œ rawsource”Œ”Œchildren”]”(Œ translations”Œ LanguagesNode”“”)”}”(hhh]”(hŒ pending_xref”“”)”}”(hhh]”Œdocutils.nodes”ŒText”“”ŒChinese (Simplified)”…””}”Œparent”hsbaŒ attributes”}”(Œids”]”Œclasses”]”Œnames”]”Œdupnames”]”Œbackrefs”]”Œ refdomain”Œstd”Œreftype”Œdoc”Œ reftarget”Œ@/translations/zh_CN/filesystems/xfs/xfs-self-describing-metadata”Œmodname”NŒ classname”NŒ refexplicit”ˆuŒtagname”hhh ubh)”}”(hhh]”hŒChinese (Traditional)”…””}”hh2sbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ@/translations/zh_TW/filesystems/xfs/xfs-self-describing-metadata”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒItalian”…””}”hhFsbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ@/translations/it_IT/filesystems/xfs/xfs-self-describing-metadata”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒJapanese”…””}”hhZsbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ@/translations/ja_JP/filesystems/xfs/xfs-self-describing-metadata”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒKorean”…””}”hhnsbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ@/translations/ko_KR/filesystems/xfs/xfs-self-describing-metadata”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒSpanish”…””}”hh‚sbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ@/translations/sp_SP/filesystems/xfs/xfs-self-describing-metadata”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubeh}”(h]”h ]”h"]”h$]”h&]”Œcurrent_language”ŒEnglish”uh1h hhŒ _document”hŒsource”NŒline”NubhŒcomment”“”)”}”(hŒ SPDX-License-Identifier: GPL-2.0”h]”hŒ SPDX-License-Identifier: GPL-2.0”…””}”hh£sbah}”(h]”h ]”h"]”h$]”h&]”Œ xml:space”Œpreserve”uh1h¡hhhžhhŸŒZ/var/lib/git/docbuild/linux/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst”h KubhŒtarget”“”)”}”(hŒ!.. 
_xfs_self_describing_metadata:”h]”h}”(h]”h ]”h"]”h$]”h&]”Œrefid”Œxfs-self-describing-metadata”uh1h´h KhhhžhhŸh³ubhŒsection”“”)”}”(hhh]”(hŒtitle”“”)”}”(hŒXFS Self Describing Metadata”h]”hŒXFS Self Describing Metadata”…””}”(hhÉhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÇhhÄhžhhŸh³h KubhÃ)”}”(hhh]”(hÈ)”}”(hŒ Introduction”h]”hŒ Introduction”…””}”(hhÚhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÇhh×hžhhŸh³h K ubhŒ paragraph”“”)”}”(hXmThe largest scalability problem facing XFS is not one of algorithmic scalability, but of verification of the filesystem structure. Scalability of the structures and indexes on disk and the algorithms for iterating them are adequate for supporting PB scale filesystems with billions of inodes; however, it is this very scalability that causes the verification problem.”h]”hXmThe largest scalability problem facing XFS is not one of algorithmic scalability, but of verification of the filesystem structure. Scalability of the structures and indexes on disk and the algorithms for iterating them are adequate for supporting PB scale filesystems with billions of inodes; however, it is this very scalability that causes the verification problem.”…””}”(hhêhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K hh×hžhubhé)”}”(hXÆAlmost all metadata on XFS is dynamically allocated. The only fixed location metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all other metadata structures need to be discovered by walking the filesystem structure in different ways. 
While this is already done by userspace tools for validating and repairing the structure, there are limits to what they can verify, and this in turn limits the supportable size of an XFS filesystem.”…””}”(hhøhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Khh×hžhubhé)”}”(hXåFor example, it is entirely possible to manually use xfs_db and a bit of scripting to analyse the structure of a 100TB filesystem when trying to determine the root cause of a corruption problem, but it is still mainly a manual task of verifying that things like single bit errors or misplaced writes weren't the ultimate cause of a corruption event. It may take a few hours to a few days to perform such forensic analysis, so at this scale root cause analysis is entirely possible.”h]”hXçFor example, it is entirely possible to manually use xfs_db and a bit of scripting to analyse the structure of a 100TB filesystem when trying to determine the root cause of a corruption problem, but it is still mainly a manual task of verifying that things like single bit errors or misplaced writes weren’t the ultimate cause of a corruption event. It may take a few hours to a few days to perform such forensic analysis, so at this scale root cause analysis is entirely possible.”…””}”(hjhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Khh×hžhubhé)”}”(hXÒHowever, if we scale the filesystem up to 1PB, we now have 10x as much metadata to analyse and so that analysis blows out towards weeks/months of forensic work. Most of the analysis work is slow and tedious, so as the amount of analysis goes up, the more likely it is that the cause will be lost in the noise. Hence the primary concern for supporting PB scale filesystems is minimising the time and effort required for basic forensic analysis of the filesystem structure.”h]”hXÒHowever, if we scale the filesystem up to 1PB, we now have 10x as much metadata to analyse and so that analysis blows out towards weeks/months of forensic work. 
Most of the analysis work is slow and tedious, so as the amount of analysis goes up, the more likely it is that the cause will be lost in the noise. Hence the primary concern for supporting PB scale filesystems is minimising the time and effort required for basic forensic analysis of the filesystem structure.”…””}”(hjhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K hh×hžhubeh}”(h]”Œ introduction”ah ]”h"]”Œ introduction”ah$]”h&]”uh1hÂhhÄhžhhŸh³h K ubhÃ)”}”(hhh]”(hÈ)”}”(hŒSelf Describing Metadata”h]”hŒSelf Describing Metadata”…””}”(hj-hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÇhj*hžhhŸh³h K)ubhé)”}”(hXhOne of the problems with the current metadata format is that apart from the magic number in the metadata block, we have no other way of identifying what it is supposed to be. We can't even identify if it is in the right place. Put simply, you can't look at a single metadata block in isolation and say "yes, it is supposed to be there and the contents are valid".”h]”hXpOne of the problems with the current metadata format is that apart from the magic number in the metadata block, we have no other way of identifying what it is supposed to be. We can’t even identify if it is in the right place. Put simply, you can’t look at a single metadata block in isolation and say “yes, it is supposed to be there and the contents are validâ€.”…””}”(hj;hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K+hj*hžhubhé)”}”(hXûHence most of the time spent on forensic analysis is spent doing basic verification of metadata values, looking for values that are in range (and hence not detected by automated verification checks) but are not correct. Finding and understanding how things like cross linked block lists (e.g. 
sibling pointers in a btree end up with loops in them) are the key to understanding what went wrong, but it is impossible to tell what order the blocks were linked into each other or written to disk after the fact.”h]”hXûHence most of the time spent on forensic analysis is spent doing basic verification of metadata values, looking for values that are in range (and hence not detected by automated verification checks) but are not correct. Finding and understanding how things like cross linked block lists (e.g. sibling pointers in a btree end up with loops in them) are the key to understanding what went wrong, but it is impossible to tell what order the blocks were linked into each other or written to disk after the fact.”…””}”(hjIhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K1hj*hžhubhé)”}”(hXQHence we need to record more information into the metadata to allow us to quickly determine if the metadata is intact and can be ignored for the purpose of analysis. We can't protect against every possible type of error, but we can ensure that common types of errors are easily detectable. Hence the concept of self describing metadata.”h]”hXSHence we need to record more information into the metadata to allow us to quickly determine if the metadata is intact and can be ignored for the purpose of analysis. We can’t protect against every possible type of error, but we can ensure that common types of errors are easily detectable. Hence the concept of self describing metadata.”…””}”(hjWhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K9hj*hžhubhé)”}”(hXThe first, fundamental requirement of self describing metadata is that the metadata object contains some form of unique identifier in a well known location. This allows us to identify the expected contents of the block and hence parse and verify the metadata object. 
If we can't independently identify the type of metadata in the object, then the metadata doesn't describe itself very well at all!”h]”hX‘The first, fundamental requirement of self describing metadata is that the metadata object contains some form of unique identifier in a well known location. This allows us to identify the expected contents of the block and hence parse and verify the metadata object. If we can’t independently identify the type of metadata in the object, then the metadata doesn’t describe itself very well at all!”…””}”(hjehžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K?hj*hžhubhé)”}”(hXdLuckily, almost all XFS metadata has magic numbers embedded already - only the AGFL, remote symlinks and remote attribute blocks do not contain identifying magic numbers. Hence we can change the on-disk format of all these objects to add more identifying information and detect this simply by changing the magic numbers in the metadata objects. That is, if it has the current magic number, the metadata isn't self identifying. If it contains a new magic number, it is self identifying and we can do much more expansive automated verification of the metadata object at runtime, during forensic analysis or repair.”h]”hXfLuckily, almost all XFS metadata has magic numbers embedded already - only the AGFL, remote symlinks and remote attribute blocks do not contain identifying magic numbers. Hence we can change the on-disk format of all these objects to add more identifying information and detect this simply by changing the magic numbers in the metadata objects. That is, if it has the current magic number, the metadata isn’t self identifying. 
If it contains a new magic number, it is self identifying and we can do much more expansive automated verification of the metadata object at runtime, during forensic analysis or repair.”…””}”(hjshžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KFhj*hžhubhé)”}”(hXËAs a primary concern, self describing metadata needs some form of overall integrity checking. We cannot trust the metadata if we cannot verify that it has not been changed as a result of external influences. Hence we need some form of integrity check, and this is done by adding CRC32c validation to the metadata block. If we can verify the block contains the metadata it was intended to contain, a large amount of the manual verification work can be skipped.”h]”hXËAs a primary concern, self describing metadata needs some form of overall integrity checking. We cannot trust the metadata if we cannot verify that it has not been changed as a result of external influences. Hence we need some form of integrity check, and this is done by adding CRC32c validation to the metadata block. If we can verify the block contains the metadata it was intended to contain, a large amount of the manual verification work can be skipped.”…””}”(hjhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KOhj*hžhubhé)”}”(hXmCRC32c was selected as metadata cannot be more than 64k in length in XFS and hence a 32 bit CRC is more than sufficient to detect multi-bit errors in metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is fast. So while CRC32c is not the strongest of possible integrity checks that could be used, it is more than sufficient for our needs and has relatively little overhead. 
Adding support for larger integrity fields and/or algorithms doesn't really provide any extra value over CRC32c, but it does add a lot of complexity and so there is no provision for changing the integrity checking mechanism.”h]”hXmCRC32c was selected as metadata cannot be more than 64k in length in XFS and hence a 32 bit CRC is more than sufficient to detect multi-bit errors in metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is fast. So while CRC32c is not the strongest of possible integrity checks that could be used, it is more than sufficient for our needs and has relatively little overhead. Adding support for larger integrity fields and/or algorithms doesn’t really provide any extra value over CRC32c, but it does add a lot of complexity and so there is no provision for changing the integrity checking mechanism.”…””}”(hjhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KVhj*hžhubhé)”}”(hXSelf describing metadata needs to contain enough information so that the metadata block can be verified as being in the correct place without needing to look at any other metadata. This means it needs to contain location information. Just adding a block number to the metadata is not sufficient to protect against mis-directed writes - a write might be misdirected to the wrong LUN and so be written to the "correct block" of the wrong filesystem. Hence location information must contain a filesystem identifier as well as a block number.”h]”hXSelf describing metadata needs to contain enough information so that the metadata block can be verified as being in the correct place without needing to look at any other metadata. This means it needs to contain location information. Just adding a block number to the metadata is not sufficient to protect against mis-directed writes - a write might be misdirected to the wrong LUN and so be written to the “correct block†of the wrong filesystem. 
Hence location information must contain a filesystem identifier as well as a block number.”…””}”(hjhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K`hj*hžhubhé)”}”(hX9Another key information point in forensic analysis is knowing who the metadata block belongs to. We already know the type, the location, that it is valid and/or corrupted, and how long ago that it was last modified. Knowing the owner of the block is important as it allows us to find other related metadata to determine the scope of the corruption. For example, if we have an extent btree object, we don't know what inode it belongs to and hence have to walk the entire filesystem to find the owner of the block. Worse, the corruption could mean that no owner can be found (i.e. it's an orphan block), and so without an owner field in the metadata we have no idea of the scope of the corruption. If we have an owner field in the metadata object, we can immediately do top down validation to determine the scope of the problem.”h]”hX=Another key information point in forensic analysis is knowing who the metadata block belongs to. We already know the type, the location, that it is valid and/or corrupted, and how long ago that it was last modified. Knowing the owner of the block is important as it allows us to find other related metadata to determine the scope of the corruption. For example, if we have an extent btree object, we don’t know what inode it belongs to and hence have to walk the entire filesystem to find the owner of the block. Worse, the corruption could mean that no owner can be found (i.e. it’s an orphan block), and so without an owner field in the metadata we have no idea of the scope of the corruption. If we have an owner field in the metadata object, we can immediately do top down validation to determine the scope of the problem.”…””}”(hj«hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Khhj*hžhubhé)”}”(hX°Different types of metadata have different owner identifiers. 
For example, directory, attribute and extent tree blocks are all owned by an inode, while freespace btree blocks are owned by an allocation group. Hence the size and contents of the owner field are determined by the type of metadata object we are looking at. The owner information can also identify misplaced writes (e.g. freespace btree block written to the wrong AG).”h]”hX°Different types of metadata have different owner identifiers. For example, directory, attribute and extent tree blocks are all owned by an inode, while freespace btree blocks are owned by an allocation group. Hence the size and contents of the owner field are determined by the type of metadata object we are looking at. The owner information can also identify misplaced writes (e.g. freespace btree block written to the wrong AG).”…””}”(hj¹hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Kthj*hžhubhé)”}”(hXSelf describing metadata also needs to contain some indication of when it was written to the filesystem. One of the key information points when doing forensic analysis is how recently the block was modified. Correlation of a set of corrupted metadata blocks based on modification times is important as it can indicate whether the corruptions are related, whether there have been multiple corruption events that led to the eventual failure, and even whether there are corruptions present that the run-time verification is not detecting.”h]”hXSelf describing metadata also needs to contain some indication of when it was written to the filesystem. One of the key information points when doing forensic analysis is how recently the block was modified. 
Correlation of a set of corrupted metadata blocks based on modification times is important as it can indicate whether the corruptions are related, whether there have been multiple corruption events that led to the eventual failure, and even whether there are corruptions present that the run-time verification is not detecting.”…””}”(hjÕhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K{hj*hžhubhé)”}”(hXÌFor example, we can determine whether a metadata object is supposed to be free space or still allocated if it is still referenced by its owner by looking at when the free space btree block that contains the block was last written compared to when the metadata object itself was last written. If the free space block is more recent than the object and the object's owner, then there is a very good chance that the block should have been removed from the owner.”h]”hXÎFor example, we can determine whether a metadata object is supposed to be free space or still allocated if it is still referenced by its owner by looking at when the free space btree block that contains the block was last written compared to when the metadata object itself was last written. If the free space block is more recent than the object and the object’s owner, then there is a very good chance that the block should have been removed from the owner.”…””}”(hjãhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Kƒhj*hžhubhé)”}”(hX{To provide this "written timestamp", each metadata block gets the Log Sequence Number (LSN) of the most recent transaction it was modified on written into it. This number will always increase over the life of the filesystem, and the only thing that resets it is running xfs_repair on the filesystem. 
Further, by use of the LSN we can tell if the corrupted metadata all belonged to the same log checkpoint and hence have some idea of how much modification occurred between the first and last instance of corrupt metadata on disk and, further, how much modification occurred between the corruption being written and when it was detected.”h]”hXTo provide this “written timestampâ€, each metadata block gets the Log Sequence Number (LSN) of the most recent transaction it was modified on written into it. This number will always increase over the life of the filesystem, and the only thing that resets it is running xfs_repair on the filesystem. Further, by use of the LSN we can tell if the corrupted metadata all belonged to the same log checkpoint and hence have some idea of how much modification occurred between the first and last instance of corrupt metadata on disk and, further, how much modification occurred between the corruption being written and when it was detected.”…””}”(hjãhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KŠhj*hžhubeh}”(h]”Œself-describing-metadata”ah ]”h"]”Œself describing metadata”ah$]”h&]”uh1hÂhhÄhžhhŸh³h K)ubhÃ)”}”(hhh]”(hÈ)”}”(hŒRuntime Validation”h]”hŒRuntime Validation”…””}”(hjühžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÇhjùhžhhŸh³h K•ubhé)”}”(hŒLValidation of self-describing metadata takes place at runtime in two places:”h]”hŒLValidation of self-describing metadata takes place at runtime in two places:”…””}”(hj hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K—hjùhžhubhŒ block_quote”“”)”}”(hŒ[- immediately after a successful read from disk - immediately prior to write IO submission ”h]”hŒ bullet_list”“”)”}”(hhh]”(hŒ list_item”“”)”}”(hŒ-immediately after a successful read from disk”h]”hé)”}”(hj'h]”hŒ-immediately after a successful read from disk”…””}”(hj)hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K™hj%ubah}”(h]”h ]”h"]”h$]”h&]”uh1j#hj ubj$)”}”(hŒ)immediately prior to write IO submission ”h]”hé)”}”(hŒ(immediately prior to write IO 
submission”h]”hŒ(immediately prior to write IO submission”…””}”(hj@hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Kšhj<ubah}”(h]”h ]”h"]”h$]”h&]”uh1j#hj ubeh}”(h]”h ]”h"]”h$]”h&]”Œbullet”Œ-”uh1jhŸh³h K™hjubah}”(h]”h ]”h"]”h$]”h&]”uh1jhŸh³h K™hjùhžhubhé)”}”(hXwThe verification is completely stateless - it is done independently of the modification process, and seeks only to check that the metadata is what it says it is and that the metadata fields are within bounds and internally consistent. As such, we cannot catch all types of corruption that can occur within a block as there may be certain limitations that operational state enforces on the metadata, or there may be corruption of interblock relationships (e.g. corrupted sibling pointer lists). Hence we still need stateful checking in the main code body, but in general most of the per-field validation is handled by the verifiers.”h]”hXwThe verification is completely stateless - it is done independently of the modification process, and seeks only to check that the metadata is what it says it is and that the metadata fields are within bounds and internally consistent. As such, we cannot catch all types of corruption that can occur within a block as there may be certain limitations that operational state enforces on the metadata, or there may be corruption of interblock relationships (e.g. corrupted sibling pointer lists). Hence we still need stateful checking in the main code body, but in general most of the per-field validation is handled by the verifiers.”…””}”(hjbhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Kœhjùhžhubhé)”}”(hXQFor read verification, the caller needs to specify the expected type of metadata that it should see, and the IO completion process verifies that the metadata object matches what was expected. If the verification process fails, then it marks the object being read as EFSCORRUPTED. 
The caller needs to catch this error (same as for IO errors), and if it needs to take special action due to a verification error it can do so by catching the EFSCORRUPTED error value. If we need more discrimination of error type at higher levels, we can define new error numbers for different errors as necessary.”h]”hXQFor read verification, the caller needs to specify the expected type of metadata that it should see, and the IO completion process verifies that the metadata object matches what was expected. If the verification process fails, then it marks the object being read as EFSCORRUPTED. The caller needs to catch this error (same as for IO errors), and if it needs to take special action due to a verification error it can do so by catching the EFSCORRUPTED error value. If we need more discrimination of error type at higher levels, we can define new error numbers for different errors as necessary.”…””}”(hjphžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K¦hjùhžhubhé)”}”(hXÕThe first step in read verification is checking the magic number and determining whether CRC validation is necessary. If it is, the CRC32c is calculated and compared against the value stored in the object itself. Once this is validated, further checks are made against the location information, followed by extensive object specific metadata validation. If any of these checks fail, then the buffer is considered corrupt and the EFSCORRUPTED error is set appropriately.”h]”hXÕThe first step in read verification is checking the magic number and determining whether CRC validation is necessary. If it is, the CRC32c is calculated and compared against the value stored in the object itself. Once this is validated, further checks are made against the location information, followed by extensive object specific metadata validation. 
If any of these checks fail, then the buffer is considered corrupt and the EFSCORRUPTED error is set appropriately.”…””}”(hj~hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K¯hjùhžhubhé)”}”(hX±Write verification is the opposite of the read verification - first the object is extensively verified and if it is OK we then update the LSN from the last modification made to the object. After this, we calculate the CRC and insert it into the object. Once this is done the write IO is allowed to continue. If any error occurs during this process, the buffer is again marked with an EFSCORRUPTED error for the higher layers to catch.”h]”hX±Write verification is the opposite of the read verification - first the object is extensively verified and if it is OK we then update the LSN from the last modification made to the object. After this, we calculate the CRC and insert it into the object. Once this is done the write IO is allowed to continue. If any error occurs during this process, the buffer is again marked with an EFSCORRUPTED error for the higher layers to catch.”…””}”(hjŒhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h K¶hjùhžhubeh}”(h]”Œruntime-validation”ah ]”h"]”Œruntime validation”ah$]”h&]”uh1hÂhhÄhžhhŸh³h K•ubhÃ)”}”(hhh]”(hÈ)”}”(hŒ Structures”h]”hŒ Structures”…””}”(hj¥hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÇhj¢hžhhŸh³h K¾ubhé)”}”(hŒHA typical on-disk structure needs to contain the following information::”h]”hŒGA typical on-disk structure needs to contain the following information:”…””}”(hj³hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KÀhj¢hžhubhŒ literal_block”“”)”}”(hXstruct xfs_ondisk_hdr { __be32 magic; /* magic number */ __be32 crc; /* CRC, not logged */ uuid_t uuid; /* filesystem identifier */ __be64 owner; /* parent object */ __be64 blkno; /* location on disk */ __be64 lsn; /* last modification in log, not logged */ };”h]”hXstruct xfs_ondisk_hdr { __be32 magic; /* magic number */ __be32 crc; /* CRC, not logged */ uuid_t uuid; /* filesystem identifier */ __be64 owner; /* 
parent object */ __be64 blkno; /* location on disk */ __be64 lsn; /* last modification in log, not logged */ };”…””}”hjÃsbah}”(h]”h ]”h"]”h$]”h&]”h±h²uh1jÁhŸh³h KÂhj¢hžhubhé)”}”(hXDepending on the metadata, this information may be part of a header structure separate to the metadata contents, or may be distributed through an existing structure. The latter occurs with metadata that already contains some of this information, such as the superblock and AG headers.”h]”hXDepending on the metadata, this information may be part of a header structure separate to the metadata contents, or may be distributed through an existing structure. The latter occurs with metadata that already contains some of this information, such as the superblock and AG headers.”…””}”(hjÑhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KËhj¢hžhubhé)”}”(hŒ„Other metadata may have different formats for the information, but the same level of information is generally provided. For example:”h]”hŒ„Other metadata may have different formats for the information, but the same level of information is generally provided. For example:”…””}”(hjßhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KÐhj¢hžhubj)”}”(hXÕ- short btree blocks have a 32 bit owner (ag number) and a 32 bit block number for location. The two of these combined provide the same information as @owner and @blkno in the above structure, but using 8 bytes less space on disk. - directory/attribute node blocks have a 16 bit magic number, and the header that contains the magic number has other information in it as well. Hence the additional metadata headers change the overall format of the metadata. ”h]”j)”}”(hhh]”(j$)”}”(hŒäshort btree blocks have a 32 bit owner (ag number) and a 32 bit block number for location. The two of these combined provide the same information as @owner and @blkno in the above structure, but using 8 bytes less space on disk. ”h]”hé)”}”(hŒãshort btree blocks have a 32 bit owner (ag number) and a 32 bit block number for location. 
The two of these combined provide the same information as @owner and @blkno in the above structure, but using 8 bytes less space on disk.”h]”hŒãshort btree blocks have a 32 bit owner (ag number) and a 32 bit block number for location. The two of these combined provide the same information as @owner and @blkno in the above structure, but using 8 bytes less space on disk.”…””}”(hjøhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KÓhjôubah}”(h]”h ]”h"]”h$]”h&]”uh1j#hjñubj$)”}”(hŒàdirectory/attribute node blocks have a 16 bit magic number, and the header that contains the magic number has other information in it as well. Hence the additional metadata headers change the overall format of the metadata. ”h]”hé)”}”(hŒßdirectory/attribute node blocks have a 16 bit magic number, and the header that contains the magic number has other information in it as well. Hence the additional metadata headers change the overall format of the metadata.”h]”hŒßdirectory/attribute node blocks have a 16 bit magic number, and the header that contains the magic number has other information in it as well. 
Hence the additional metadata headers change the overall format of the metadata.”…””}”(hjhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KØhj ubah}”(h]”h ]”h"]”h$]”h&]”uh1j#hjñubeh}”(h]”h ]”h"]”h$]”h&]”jZj[uh1jhŸh³h KÓhjíubah}”(h]”h ]”h"]”h$]”h&]”uh1jhŸh³h KÓhj¢hžhubhé)”}”(hŒ9A typical buffer read verifier is structured as follows::”h]”hŒ8A typical buffer read verifier is structured as follows:”…””}”(hj0hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h KÝhj¢hžhubjÂ)”}”(hX#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) static void xfs_foo_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; if ((xfs_sb_version_hascrc(&mp->m_sb) && !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF)) || !xfs_foo_verify(bp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); } }”h]”hX#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) static void xfs_foo_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; if ((xfs_sb_version_hascrc(&mp->m_sb) && !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF)) || !xfs_foo_verify(bp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); } }”…””}”hj>sbah}”(h]”h ]”h"]”h$]”h&]”h±h²uh1jÁhŸh³h Kßhj¢hžhubhé)”}”(hŒàThe code ensures that the CRC is only checked if the filesystem has CRCs enabled by checking the superblock for the feature bit, and then if the CRC verifies OK (or is not needed) it verifies the actual contents of the block.”h]”hŒàThe code ensures that the CRC is only checked if the filesystem has CRCs enabled by checking the superblock for the feature bit, and then if the CRC verifies OK (or is not needed) it verifies the actual contents of the block.”…””}”(hjLhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Kðhj¢hžhubhé)”}”(hŒÎThe verifier function will take a couple of different forms, depending on whether the magic number can be used to determine the format of 
the block. Where it can't, the code is structured as follows::”h]”hŒÏThe verifier function will take a couple of different forms, depending on whether the magic number can be used to determine the format of the block. Where it can’t, the code is structured as follows:”…””}”(hjZhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Kôhj¢hžhubjÂ)”}”(hXstatic bool
xfs_foo_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_ondisk_hdr	*hdr = bp->b_addr;

	if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
		return false;

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
			return false;
		if (bp->b_bn != be64_to_cpu(hdr->blkno))
			return false;
		if (hdr->owner == 0)
			return false;
	}

	/* object specific verification checks here */

	return true;
}”h]”hXstatic bool
xfs_foo_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_ondisk_hdr	*hdr = bp->b_addr;

	if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
		return false;

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
			return false;
		if (bp->b_bn != be64_to_cpu(hdr->blkno))
			return false;
		if (hdr->owner == 0)
			return false;
	}

	/* object specific verification checks here */

	return true;
}”…””}”hjhsbah}”(h]”h ]”h"]”h$]”h&]”h±h²uh1jÁhŸh³h Køhj¢hžhubhé)”}”(hŒ]If there are different magic numbers for the different formats, the verifier will look like::”h]”hŒ\If there are different magic numbers for the different formats, the verifier will look like:”…””}”(hjvhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h Mhj¢hžhubjÂ)”}”(hX¤static bool
xfs_foo_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_ondisk_hdr	*hdr = bp->b_addr;

	if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
		if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
			return false;
		if (bp->b_bn != be64_to_cpu(hdr->blkno))
			return false;
		if (hdr->owner == 0)
			return false;
	} else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
		return false;

	/* object
specific verification checks here */

	return true;
}”h]”hX¤static bool
xfs_foo_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_ondisk_hdr	*hdr = bp->b_addr;

	if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
		if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
			return false;
		if (bp->b_bn != be64_to_cpu(hdr->blkno))
			return false;
		if (hdr->owner == 0)
			return false;
	} else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
		return false;

	/* object specific verification checks here */

	return true;
}”…””}”hj„sbah}”(h]”h ]”h"]”h$]”h&]”h±h²uh1jÁhŸh³h Mhj¢hžhubhé)”}”(hŒ“Write verifiers are very similar to the read verifiers; they just do things in the opposite order. A typical write verifier::”h]”hŒ’Write verifiers are very similar to the read verifiers; they just do things in the opposite order. A typical write verifier:”…””}”(hj’hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h M)hj¢hžhubjÂ)”}”(hX©static void
xfs_foo_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_fspriv;

	if (!xfs_foo_verify(bp)) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				     mp, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
		return;
	}

	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	if (bip) {
		struct xfs_ondisk_hdr	*hdr = bp->b_addr;

		hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn);
	}
	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF);
}”h]”hX©static void
xfs_foo_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_fspriv;

	if (!xfs_foo_verify(bp)) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
				     mp, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
		return;
	}

	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	if (bip) {
		struct xfs_ondisk_hdr	*hdr = bp->b_addr;

		hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn);
	}
	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF);
}”…””}”hj sbah}”(h]”h 
]”h"]”h$]”h&]”h±h²uh1jÁhŸh³h M,hj¢hžhubhé)”}”(hXbThis will verify the internal structure of the metadata before we go any further, detecting corruptions that have occurred as the metadata has been modified in memory. If the metadata verifies OK, and CRCs are enabled, we then update the LSN field (when it was last modified) and calculate the CRC on the metadata. Once this is done, we can issue the IO.”h]”hXbThis will verify the internal structure of the metadata before we go any further, detecting corruptions that have occurred as the metadata has been modified in memory. If the metadata verifies OK, and CRCs are enabled, we then update the LSN field (when it was last modified) and calculate the CRC on the metadata. Once this is done, we can issue the IO.”…””}”(hj®hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h MDhj¢hžhubeh}”(h]”Œ structures”ah ]”h"]”Œ structures”ah$]”h&]”uh1hÂhhÄhžhhŸh³h K¾ubhÃ)”}”(hhh]”(hÈ)”}”(hŒInodes and Dquots”h]”hŒInodes and Dquots”…””}”(hjÇhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÇhjÄhžhhŸh³h MKubhé)”}”(hXInodes and dquots are special snowflakes. They have per-object CRC and self-identifiers, but they are packed so that there are multiple objects per buffer. Hence we do not use per-buffer verifiers to do the work of per-object verification and CRC calculations. The per-buffer verifiers simply perform basic identification of the buffer - that they contain inodes or dquots, and that there are magic numbers in all the expected spots. All further CRC and verification checks are done when each inode is read from or written back to the buffer.”h]”hXInodes and dquots are special snowflakes. They have per-object CRC and self-identifiers, but they are packed so that there are multiple objects per buffer. Hence we do not use per-buffer verifiers to do the work of per-object verification and CRC calculations. 
The per-buffer verifiers simply perform basic identification of the buffer - that they contain inodes or dquots, and that there are magic numbers in all the expected spots. All further CRC and verification checks are done when each inode is read from or written back to the buffer.”…””}”(hjÕhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h MMhjÄhžhubhé)”}”(hXïThe structure of the verifiers and the identifier checks is very similar to the buffer code described above. The only difference is where they are called. For example, inode read verification is done in xfs_inode_from_disk() when the inode is first read out of the buffer and the struct xfs_inode is instantiated. The inode is already extensively verified during writeback in xfs_iflush_int(), so the only addition here is to add the LSN and CRC to the inode as it is copied back into the buffer.”h]”hXïThe structure of the verifiers and the identifier checks is very similar to the buffer code described above. The only difference is where they are called. For example, inode read verification is done in xfs_inode_from_disk() when the inode is first read out of the buffer and the struct xfs_inode is instantiated. The inode is already extensively verified during writeback in xfs_iflush_int(), so the only addition here is to add the LSN and CRC to the inode as it is copied back into the buffer.”…””}”(hjãhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h MVhjÄhžhubhé)”}”(hX4XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of the unlinked list modifications check or update CRCs, neither during unlink nor log recovery. So, it's gone unnoticed until now. This won't matter immediately - repair will probably complain about it - but it needs to be fixed.”h]”hX:XXX: inode unlinked list modification doesn’t recalculate the inode CRC! None of the unlinked list modifications check or update CRCs, neither during unlink nor log recovery. So, it’s gone unnoticed until now.
This won’t matter immediately - repair will probably complain about it - but it needs to be fixed.”…””}”(hjñhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hèhŸh³h M^hjÄhžhubeh}”(h]”Œinodes-and-dquots”ah ]”h"]”Œinodes and dquots”ah$]”h&]”uh1hÂhhÄhžhhŸh³h MKubeh}”(h]”(hÁŒid1”eh ]”h"]”(Œxfs self describing metadata”Œxfs_self_describing_metadata”eh$]”h&]”uh1hÂhhhžhhŸh³h KŒexpect_referenced_by_name”}”j h¶sŒexpect_referenced_by_id”}”hÁh¶subeh}”(h]”h ]”h"]”h$]”h&]”Œsource”h³uh1hŒcurrent_source”NŒ current_line”NŒsettings”Œdocutils.frontend”ŒValues”“”)”}”(hÇNŒ generator”NŒ datestamp”NŒ source_link”NŒ source_url”NŒ toc_backlinks”Œentry”Œfootnote_backlinks”KŒ sectnum_xform”KŒstrip_comments”NŒstrip_elements_with_classes”NŒ strip_classes”NŒ report_level”KŒ halt_level”KŒexit_status_level”KŒdebug”NŒwarning_stream”NŒ traceback”ˆŒinput_encoding”Œ utf-8-sig”Œinput_encoding_error_handler”Œstrict”Œoutput_encoding”Œutf-8”Œoutput_encoding_error_handler”j7Œerror_encoding”Œutf-8”Œerror_encoding_error_handler”Œbackslashreplace”Œ language_code”Œen”Œrecord_dependencies”NŒconfig”NŒ id_prefix”hŒauto_id_prefix”Œid”Œ dump_settings”NŒdump_internals”NŒdump_transforms”NŒdump_pseudo_xml”NŒexpose_internals”NŒstrict_visitor”NŒ_disable_config”NŒ_source”h³Œ _destination”NŒ _config_files”]”Œ7/var/lib/git/docbuild/linux/Documentation/docutils.conf”aŒfile_insertion_enabled”ˆŒ raw_enabled”KŒline_length_limit”M'Œpep_references”NŒ pep_base_url”Œhttps://peps.python.org/”Œpep_file_url_template”Œpep-%04d”Œrfc_references”NŒ rfc_base_url”Œ&https://datatracker.ietf.org/doc/html/”Œ tab_width”KŒtrim_footnote_reference_space”‰Œsyntax_highlight”Œlong”Œ smart_quotes”ˆŒsmartquotes_locales”]”Œcharacter_level_inline_markup”‰Œdoctitle_xform”‰Œ docinfo_xform”KŒsectsubtitle_xform”‰Œ image_loading”Œlink”Œembed_stylesheet”‰Œcloak_email_addresses”ˆŒsection_self_link”‰Œenv”NubŒreporter”NŒindirect_targets”]”Œsubstitution_defs”}”Œsubstitution_names”}”Œrefnames”}”Œrefids”}”hÁ]”h¶asŒnameids”}”(j hÁj j j'j$jöjójŸjœjÁj¾jjuŒ nametypes”}”(j ˆj 
‰j'‰jö‰jŸ‰jÁ‰j‰uh}”(hÁhÄj hÄj$h×jój*jœjùj¾j¢jjÄuŒ footnote_refs”}”Œ citation_refs”}”Œ autofootnotes”]”Œautofootnote_refs”]”Œsymbol_footnotes”]”Œsymbol_footnote_refs”]”Œ footnotes”]”Œ citations”]”Œautofootnote_start”KŒsymbol_footnote_start”KŒ id_counter”Œ collections”ŒCounter”“”}”jEKs…”R”Œparse_messages”]”Œtransform_messages”]”hŒsystem_message”“”)”}”(hhh]”hé)”}”(hhh]”hŒBHyperlink target "xfs-self-describing-metadata" is not referenced.”…””}”hj¡sbah}”(h]”h ]”h"]”h$]”h&]”uh1hèhjžubah}”(h]”h ]”h"]”h$]”h&]”Œlevel”KŒtype”ŒINFO”Œsource”h³Œline”Kuh1jœubaŒ transformer”NŒ include_log”]”Œ decoration”Nhžhub.