1月 29 2020

学习一个文件系统，需要熟悉它的数据layout，为此就必须深入理解layout相关的数据结构。结合本人最近学习f2fs的心得，下面总结了相关的几个最重要的数据结构。

基本概念

block: 4KB对齐且连续的物理存储空间
segment: 2MB连续的物理存储空间
section: 由若干连续的segment组成
zone: 由若干连续的section组成

node

node是内部用来定位的。通过下面的数据结构可以看到，f2fs里面的node 主要就是用来记录block的地址。相关的数据结构如下：

/*
 * Node block: the payload is exactly one of an inode, a direct node or an
 * indirect node (they share storage through the anonymous union), and the
 * payload is followed by a node_footer.
 */
struct f2fs_node {
	union {
		struct f2fs_inode i;		/* payload viewed as an inode */
		struct direct_node dn;		/* payload viewed as a direct node */
		struct indirect_node in;	/* payload viewed as an indirect node */
	};
	struct node_footer footer;		/* trailer placed after the payload */
} __packed;
/* Direct node payload: a flat table of ADDRS_PER_BLOCK data block addresses. */
struct direct_node {
	__le32 addr[ADDRS_PER_BLOCK];
} __packed;
/*
 * Indirect node payload: a flat table of NIDS_PER_BLOCK 32-bit entries.
 * NOTE(review): the original comment described these as "data block
 * address"; the field name suggests they are node ids -- confirm.
 */
struct indirect_node {
	__le32 nid[NIDS_PER_BLOCK];
} __packed;

上面f2fs_node包含了一个union，里面可能是f2fs_inode。它里面的一个重要内容就是用来索引逻辑文件或者目录里的数据。具体结构如下。

inode

inode是用来和外部用户交互的：它承载了与VFS交互所需的信息，包括ACL、time等相关数据信息。主要数据结构如下：

/*
 * Inode payload of a node block: file attributes, timestamps, name of the
 * file, the cached extent, and the index arrays (i_addr for data blocks,
 * i_nid for further node blocks).
 */
struct f2fs_inode {
	/* mode, hints and ownership */
	__le16 i_mode;			/* file mode */
	__u8 i_advise;			/* file hints */
	__u8 i_inline;			/* file inline flags */
	__le32 i_uid;			/* user ID */
	__le32 i_gid;			/* group ID */
	__le32 i_links;			/* links count */

	/* size */
	__le64 i_size;			/* file size, in bytes */
	__le64 i_blocks;		/* file size, in blocks */

	/* timestamps: a 64-bit value plus a separate nanosecond part each */
	__le64 i_atime;			/* access time */
	__le64 i_ctime;			/* change time */
	__le64 i_mtime;			/* modification time */
	__le32 i_atime_nsec;		/* access time, nanosecond part */
	__le32 i_ctime_nsec;		/* change time, nanosecond part */
	__le32 i_mtime_nsec;		/* modification time, nanosecond part */

	__le32 i_generation;		/* file version (for NFS) */

	/* one slot, interpreted according to the kind of file */
	union {
		__le32 i_current_depth;	/* directories only: directory depth */
		__le16 i_gc_failures;	/*
					 * regular files only: number of gc
					 * failures on a pinned file
					 */
	};

	__le32 i_xattr_nid;		/* nid used to save xattr */
	__le32 i_flags;			/* file attributes */
	__le32 i_pino;			/* parent inode number */
	__le32 i_namelen;		/* file name length */
	__u8 i_name[F2FS_NAME_LEN];	/* file name, kept for SPOR */
	__u8 i_dir_level;		/* dentry_level for large dir */

	struct f2fs_extent i_ext;	/* cache of the largest extent */

	/*
	 * The extra-attribute struct and i_addr are members of the same
	 * union, so the extra attributes overlay the beginning of i_addr.
	 */
	union {
		struct {
			__le16 i_extra_isize;		/* extra inode attribute size */
			__le16 i_inline_xattr_size;	/* inline xattr size, unit: 4 bytes */
			__le32 i_projid;		/* project id */
			__le32 i_inode_checksum;	/* inode meta checksum */
			__le64 i_crtime;		/* creation time */
			__le32 i_extra_end[0];		/* marker used for attribute size calculation */
		} __packed;
		__le32 i_addr[DEF_ADDRS_PER_INODE];	/* pointers to data blocks */
	};

	/* node ids: direct(2), indirect(2), double_indirect(1) */
	__le32 i_nid[DEF_NIDS_PER_INODE];
} __packed;

其中上面的i_addr 直接可以指向数据块，如果数据块的数量超过了DEF_ADDRS_PER_INODE，就需要使用i_nid。 i_nid 数组可以用来分别指向2个direct、2个indirect、1个double indirect的 block地址索引块。

NAT

上面f2fs_inode数据结构是一个inode块里面的内容。那么这个inode块的地址如何确定呢？这就是f2fs_nat_entry的职责了, 每个f2fs_nat_entry 记录了每个inode编号和其inode块数据地址的对应关系。而专门存储f2fs_nat_entry的block，组成了f2fs_nat_block。

/*
 * NAT entries
 *
 * NAT_ENTRY_PER_BLOCK is how many struct f2fs_nat_entry records fit in
 * PAGE_SIZE bytes (integer division, any remainder is unused).
 */
#define NAT_ENTRY_PER_BLOCK	(PAGE_SIZE / sizeof(struct f2fs_nat_entry))
/* One NAT record: a version byte, an inode number and a block address. */
struct f2fs_nat_entry {
	__u8 version;		/* latest version of the cached nat entry */
	__le32 ino;		/* inode number */
	__le32 block_addr;	/* block address */
} __packed;
/* A block filled with NAT_ENTRY_PER_BLOCK consecutive NAT records. */
struct f2fs_nat_block {
	struct f2fs_nat_entry entries[NAT_ENTRY_PER_BLOCK];
} __packed;

可是上面问题又来了， inode number如何确定？NAT block起始地址在哪，有多少个？

f2fs dir entry

f2fs_dir_entry 回答了上面的第一个问题，它把inode number和文件名通过hash关联起来了。同样，也有专门存储f2fs_dir_entry的块，叫做f2fs_dentry_block.

/* Geometry of a directory entry block. */
#define NR_DENTRY_IN_BLOCK	214	/* number of dentry slots in a block */
#define SIZE_OF_DIR_ENTRY	11	/* size of one dentry, in bytes */
/* Bytes needed for a bitmap with one bit per dentry slot, rounded up. */
#define SIZE_OF_DENTRY_BITMAP	\
	((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / BITS_PER_BYTE)
/* Bytes of PAGE_SIZE left over after the dentries, name slots and bitmap. */
#define SIZE_OF_RESERVED	\
	(PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * NR_DENTRY_IN_BLOCK + \
		      SIZE_OF_DENTRY_BITMAP))
/*
 * One directory entry slot; it represents a file name of F2FS_SLOT_LEN
 * bytes. The packed fields add up to 4 + 4 + 2 + 1 = 11 bytes.
 */
struct f2fs_dir_entry {
	__le32 hash_code;	/* hash code of the file name */
	__le32 ino;		/* inode number */
	__le16 name_len;	/* length of the file name */
	__u8 file_type;		/* file type */
} __packed;
/*
 * 4KB-sized directory entry block: validity bitmap, padding, the dentry
 * table, and one F2FS_SLOT_LEN-byte name slot per dentry.
 */
struct f2fs_dentry_block {
	/* validity bitmap for the directory entries of this block */
	__u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
	/* padding sized by SIZE_OF_RESERVED */
	__u8 reserved[SIZE_OF_RESERVED];
	/* dentry table, NR_DENTRY_IN_BLOCK slots */
	struct f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK];
	/* file name storage, indexed like the dentry table */
	__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
} __packed;

这里我之前的一个顾虑是，如果不同的file name hash到同一个hash_code，岂不是出问题了？后来通过看具体实现的代码，可以看到，实际还会加上file name的比较。这样就可以避免碰撞了。
那么，f2fs第一个inode节点（root inode)的inode number是怎么确定的？又存储在哪呢？

f2fs super block

f2fs super block数据结构回答了上面的问题，同时也记录了NAT block的起始地址。主要的数据结构如下：

/*
 * Super block: identification, geometry (as log2 values and counts), the
 * start address and size of every metadata area, well-known inode numbers,
 * and volume-level settings.
 */
struct f2fs_super_block {
	/* identification */
	__le32 magic;			/* Magic Number */
	__le16 major_ver;		/* Major Version */
	__le16 minor_ver;		/* Minor Version */

	/* geometry */
	__le32 log_sectorsize;		/* log2 sector size in bytes */
	__le32 log_sectors_per_block;	/* log2 # of sectors per block */
	__le32 log_blocksize;		/* log2 block size in bytes */
	__le32 log_blocks_per_seg;	/* log2 # of blocks per segment */
	__le32 segs_per_sec;		/* # of segments per section */
	__le32 secs_per_zone;		/* # of sections per zone */
	__le32 checksum_offset;		/* checksum offset inside super block */

	/* totals */
	__le64 block_count;		/* total # of user blocks */
	__le32 section_count;		/* total # of sections */
	__le32 segment_count;		/* total # of segments */

	/* per-area segment counts */
	__le32 segment_count_ckpt;	/* # of segments for checkpoint */
	__le32 segment_count_sit;	/* # of segments for SIT */
	__le32 segment_count_nat;	/* # of segments for NAT */
	__le32 segment_count_ssa;	/* # of segments for SSA */
	__le32 segment_count_main;	/* # of segments for main area */

	/* per-area start block addresses */
	__le32 segment0_blkaddr;	/* start block address of segment 0 */
	__le32 cp_blkaddr;		/* start block address of checkpoint */
	__le32 sit_blkaddr;		/* start block address of SIT */
	__le32 nat_blkaddr;		/* start block address of NAT */
	__le32 ssa_blkaddr;		/* start block address of SSA */
	__le32 main_blkaddr;		/* start block address of main area */

	/* well-known inode numbers */
	__le32 root_ino;		/* root inode number */
	__le32 node_ino;		/* node inode number */
	__le32 meta_ino;		/* meta inode number */

	/* volume identity */
	__u8 uuid[16];			/* 128-bit uuid for volume */
	__le16 volume_name[MAX_VOLUME_NAME];	/* volume name */

	/* file extension list */
	__le32 extension_count;		/* # of extensions below */
	__u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; /* extension array */

	/* NOTE(review): undocumented in the original; presumably checkpoint payload size -- confirm */
	__le32 cp_payload;

	/* versions and features */
	__u8 version[VERSION_LEN];	/* the kernel version */
	__u8 init_version[VERSION_LEN];	/* the initial kernel version */
	__le32 feature;			/* defined features */

	/* encryption */
	__u8 encryption_level;		/* versioning level for encryption */
	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */

	struct f2fs_device devs[MAX_DEVICES];	/* device list */
	__le32 qf_ino[F2FS_MAX_QUOTAS];	/* quota inode numbers */
	__u8 hot_ext_count;		/* # of hot file extension */
	__u8 reserved[314];		/* valid reserved region */
} __packed;

而super block的位置是固定的，当以f2fs格式化一个磁盘的时候，它会写入到磁盘固定偏移的地方。

SIT

由于f2fs是LFS，追加的写的大小不固定，很可能小于一个segment的大小，这就需要记录哪些block已经使用。segment info table 就是做这个事情的，里面的valid_map记录了有效的块。

/*
* Note that f2fs_sit_entry->vblocks has the following bit-field information.
* [15:10] : allocation type such as CURSEG_XXXX_TYPE
* [9:0] : valid block count
*/
/* Number of low bits of vblocks that hold the valid block count. */
#define SIT_VBLOCKS_SHIFT	10
/*
 * Mask selecting bits [9:0] of vblocks (the valid block count).
 * The shift operator must be written "<<"; the split form "< <" does not
 * compile once the macro is expanded.
 */
#define SIT_VBLOCKS_MASK	((1 << SIT_VBLOCKS_SHIFT) - 1)
/* Valid block count of a raw SIT entry: bits [9:0] of its vblocks field. */
#define GET_SIT_VBLOCKS(raw_sit)				\
	(le16_to_cpu((raw_sit)->vblocks) & SIT_VBLOCKS_MASK)
/* Allocation type of a raw SIT entry: bits [15:10] of its vblocks field. */
#define GET_SIT_TYPE(raw_sit)					\
	((le16_to_cpu((raw_sit)->vblocks) & ~SIT_VBLOCKS_MASK)	\
	 >> SIT_VBLOCKS_SHIFT)
/*
 * One SIT record for a segment: packed type/count word, validity bitmap,
 * and a modification time.
 */
struct f2fs_sit_entry {
	__le16 vblocks;				/* [15:10] allocation type, [9:0] valid block count */
	__u8 valid_map[SIT_VBLOCK_MAP_SIZE];	/* bitmap for valid blocks */
	__le64 mtime;				/* segment age for cleaning */
} __packed;
/* A block filled with SIT_ENTRY_PER_BLOCK consecutive SIT records. */
struct f2fs_sit_block {
	struct f2fs_sit_entry entries[SIT_ENTRY_PER_BLOCK];
} __packed;

segment summary

f2fs 一个重要的设计特色就是避免了传统LFS 的wandering tree问题，这个主要是通过segment summary 相关的数据结构实现的。
通过上面SIT的介绍，一次写之后，需要更新对应的SIT。这个更新会记录到 sit_journal_entry中：

/* One journaled SIT update: a segment number paired with its SIT record. */
struct sit_journal_entry {
	__le32 segno;			/* segment number the record belongs to */
	struct f2fs_sit_entry se;	/* the SIT record itself */
} __packed;
/*
 * SIT journal area: SIT_JOURNAL_ENTRIES journaled records followed by
 * SIT_JOURNAL_RESERVED bytes of padding.
 */
struct sit_journal {
	struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES];
	__u8 reserved[SIT_JOURNAL_RESERVED];
} __packed;

如果新建一个文件或目录，并且有写操作，就需要更新nat 区域。同样对这个inode的更新也会记录到nat_journal_entry中：

/* One journaled NAT update: a node id paired with its NAT record. */
struct nat_journal_entry {
	__le32 nid;			/* node id the record belongs to */
	struct f2fs_nat_entry ne;	/* the NAT record itself */
} __packed;
/*
 * NAT journal area: NAT_JOURNAL_ENTRIES journaled records followed by
 * NAT_JOURNAL_RESERVED bytes of padding.
 *
 * The definition is terminated with "__packed;" like struct sit_journal;
 * without it the declaration has no closing semicolon.
 */
struct nat_journal {
	struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES];
	__u8 reserved[NAT_JOURNAL_RESERVED];
} __packed;

一个写操作，其实对NAT/SIT更新的区域很小。如果每次都直接更新这两个区域，对SSD会导致比较大的写放大。为了避免这个问题，f2fs 通过segment summary把这些零星的写攒到segment summary 区域。

/*
 * Segment summary
 *
 * A summary block holds exactly 512 summary entries, which by default
 * describe exactly one 2MB segment. These basic units must not be changed.
 *
 * NOTE: fields must be initialized with set_summary.
 *
 * Meaning of nid:
 *  - for a data page, nid is the nid of its dnode;
 *  - for a node page, nid is the nid of the node page itself.
 *
 * ofs_in_node is used only for data pages. It is the offset, counted from
 * the beginning of the node page, at which the data block address is found.
 * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
 */
#define ENTRIES_IN_SUM		512	/* summary entries per summary block */
#define SUMMARY_SIZE		(7)	/* sizeof(struct summary) */
#define SUM_FOOTER_SIZE		(5)	/* sizeof(struct summary_footer) */
/* bytes taken by all summary entries of one summary block */
#define SUM_ENTRY_SIZE		(SUMMARY_SIZE * ENTRIES_IN_SUM)
/*
 * Summary entry for one 4KB-sized block of a segment: the parent node id
 * followed by three bytes that are either reserved or hold a version and
 * an offset inside the parent node.
 */
struct f2fs_summary {
	__le32 nid;			/* parent node id */
	union {
		__u8 reserved[3];	/* the same three bytes, viewed raw */
		struct {
			__u8 version;		/* node version number */
			__le16 ofs_in_node;	/* block index in parent node */
		} __packed;
	};
} __packed;
/*
 * Summary block type. The value (data or node) is what gets stored in
 * summary_footer.
 */
#define SUM_TYPE_DATA		(0)
#define SUM_TYPE_NODE		(1)
/* Trailer of a summary block: the block type plus a checksum. */
struct summary_footer {
	unsigned char entry_type;	/* one of SUM_TYPE_XXX */
	__le32 check_sum;		/* summary checksum */
} __packed;
/* Bytes of a block left for the journal after the entries and the footer. */
#define SUM_JOURNAL_SIZE	\
	(F2FS_BLKSIZE - SUM_FOOTER_SIZE - SUM_ENTRY_SIZE)

/*
 * How many journal records fit in the journal area once 2 bytes are set
 * aside, and how many bytes remain as padding. The 2 bytes presumably
 * correspond to the 16-bit n_nats/n_sits counter of struct f2fs_journal.
 */
#define NAT_JOURNAL_ENTRIES	\
	((SUM_JOURNAL_SIZE - 2) / sizeof(struct nat_journal_entry))
#define NAT_JOURNAL_RESERVED	\
	((SUM_JOURNAL_SIZE - 2) % sizeof(struct nat_journal_entry))
#define SIT_JOURNAL_ENTRIES	\
	((SUM_JOURNAL_SIZE - 2) / sizeof(struct sit_journal_entry))
#define SIT_JOURNAL_RESERVED	\
	((SUM_JOURNAL_SIZE - 2) % sizeof(struct sit_journal_entry))

/*
 * The reserved area must make f2fs_extra_info the same size as
 * nat_journal and sit_journal.
 */
#define EXTRA_INFO_RESERVED	(SUM_JOURNAL_SIZE - 2 - 8)
/*
 * Journal kinds. Frequently updated NAT/SIT entries can be stored in the
 * spare area of summary blocks.
 */
enum {
	NAT_JOURNAL = 0,
	SIT_JOURNAL = 1
};
/*
 * Extra info stored in the journal spare area: a 64-bit counter followed
 * by EXTRA_INFO_RESERVED bytes of padding.
 */
struct f2fs_extra_info {
	__le64 kbytes_written;
	__u8 reserved[EXTRA_INFO_RESERVED];
} __packed;
/*
 * Journal part of a summary block: a 16-bit counter followed by a spare
 * area that is viewed as a NAT journal, a SIT journal, or extra info.
 */
struct f2fs_journal {
	/* one 16-bit counter, named after the journal kind in use */
	union {
		__le16 n_nats;
		__le16 n_sits;
	};
	/* spare area, used by NAT or SIT journals or by extra info */
	union {
		struct nat_journal nat_j;
		struct sit_journal sit_j;
		struct f2fs_extra_info info;
	};
} __packed;
/*
 * 4KB-sized summary block structure: the summary entries, then the
 * journal, then the footer.
 */
struct f2fs_summary_block {
	/*
	 * ENTRIES_IN_SUM (512) entries of SUMMARY_SIZE (7) bytes each: the
	 * packed struct f2fs_summary is a 4-byte nid plus a 3-byte union.
	 * Each entry records the parent node information of one block.
	 */
	struct f2fs_summary entries[ENTRIES_IN_SUM];
	struct f2fs_journal journal;	/* spare area for NAT/SIT journals or extra info */
	struct summary_footer footer;	/* entry type and checksum */
} __packed;

file 相关操作

f2fs.h:
file_operations f2fs_file_operations:
fs/f2fs/file.c:

/*
 * File operation table for f2fs files. Entries named f2fs_* are handlers
 * from f2fs itself; the remaining entries reuse generic helpers.
 */
const struct file_operations f2fs_file_operations = {
	/* open/close and syncing */
	.open		= f2fs_file_open,
	.release	= f2fs_release_file,
	.flush		= f2fs_file_flush,
	.fsync		= f2fs_sync_file,

	/* positioning and data transfer */
	.llseek		= f2fs_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= f2fs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,

	/* mapping and space allocation */
	.mmap		= f2fs_file_mmap,
	.fallocate	= f2fs_fallocate,

	/* ioctl; the compat entry exists only when CONFIG_COMPAT is defined */
	.unlocked_ioctl	= f2fs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= f2fs_compat_ioctl,
#endif
};

参考

include/linux/f2fs_fs.h
fs/f2fs/
Documentation/filesystems/f2fs.txt

Post Views: 483

发表于存储相关技术

一	二	三	四	五	六	日
« 3月
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

f2fs系列之二：重要的数据结构

基本概念

node

inode

NAT

f2fs dir entry

f2fs super block

SIT

segment summary

file 相关操作

参考

About The Author

发表评论取消回复

f2fs系列之二： 重要的数据结构

基本概念

node

inode

NAT

f2fs dir entry

f2fs super block

SIT

segment summary

file 相关操作

参考

About The Author

发表评论 取消回复

f2fs系列之二：重要的数据结构

发表评论取消回复