学习一个文件系统,需要熟悉它的数据layout,为此就必须深入理解layout相关的数据结构。结合本人最近学习f2fs的心得,下面总结了相关的几个最重要的数据结构。

基本概念

block: 4KB对齐且连续的物理存储空间
segment: 2M连续的物理存储空间
session: 若干连续的segment 组成
zone: 若干连续的zone组成

node

node是内部用来定位的。通过下面的数据结构可以看到,f2fs里面的node 主要就是用来记录block的地址。相关的数据结构如下:

struct f2fs_node {
/* can be one of three types: inode, direct, and indirect types */
union {
struct f2fs_inode i;
struct direct_node dn;
struct indirect_node in;
};
struct node_footer footer;
} __packed;
struct direct_node {
__le32 addr[ADDRS_PER_BLOCK]; /* array of data block address */
} __packed;
struct indirect_node {
__le32 nid[NIDS_PER_BLOCK]; /* array of data block address */
} __packed;

上面f2fs_node包含了以个union,里面可能是f2fs_inode。它里面的一个重要内容就是用来索引逻辑文件或者目录里的数据。具体结构如下。

inode

inode是用来和外部用户交互的,inode包括和VFS交互,包括ACL、time等相关数据信息。主要数据结构如下:

struct f2fs_inode {
__le16 i_mode; /* file mode */
__u8 i_advise; /* file hints */
__u8 i_inline; /* file inline flags */
__le32 i_uid; /* user ID */
__le32 i_gid; /* group ID */
__le32 i_links; /* links count */
__le64 i_size; /* file size in bytes */
__le64 i_blocks; /* file size in blocks */
__le64 i_atime; /* access time */
__le64 i_ctime; /* change time */
__le64 i_mtime; /* modification time */
__le32 i_atime_nsec; /* access time in nano scale */
__le32 i_ctime_nsec; /* change time in nano scale */
__le32 i_mtime_nsec; /* modification time in nano scale */
__le32 i_generation; /* file version (for NFS) */
union {
__le32 i_current_depth; /* only for directory depth */
__le16 i_gc_failures; /*
* # of gc failures on pinned file.
* only for regular files.
*/
};
__le32 i_xattr_nid; /* nid to save xattr */
__le32 i_flags; /* file attributes */
__le32 i_pino; /* parent inode number */
__le32 i_namelen; /* file name length */
__u8 i_name[F2FS_NAME_LEN]; /* file name for SPOR */
__u8 i_dir_level; /* dentry_level for large dir */
struct f2fs_extent i_ext; /* caching a largest extent */
union {
struct { // for what usage?
__le16 i_extra_isize; /* extra inode attribute size */
__le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */
__le32 i_projid; /* project id */
__le32 i_inode_checksum;/* inode meta checksum */
__le64 i_crtime; /* creation time */
__le32 i_extra_end[0]; /* for attribute size calculation */
} __packed;
__le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
};
__le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2),
double_indirect(1) node id */
} __packed;

其中上面的i_addr 直接可以指向数据块,如果数据块的数量超过了DEF_NIDS_PER_INODE,就需要使用i_nid。 i_nid 数组可以用来分别指向2个direct、2个indirect、1个double indirect的 block地址索引块。

NAT

上面f2fs_inode数据结构是一个inode块里面的内容。那么这个inode块的地址如何确定呢?这就是f2fs_nat_entry的职责了, 每个f2fs_nat_entry 记录了每个inode编号和其inode块数据地址的对应关系。而专门存储f2fs_nat_entry的block,组成了f2fs_nat_block。

/*
* For NAT entries
*/
#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
struct f2fs_nat_entry {
__u8 version; /* latest version of cached nat entry */
__le32 ino; /* inode number */
__le32 block_addr; /* block address */
} __packed;
struct f2fs_nat_block {
struct f2fs_nat_entry entries[NAT_ENTRY_PER_BLOCK];
} __packed;

可是上面问题又来了, inode number如何确定?NAT block起始地址在哪,有多少个?

f2fs dir entry

f2fs_dir_entry 回答了上面的第一个问题,它把inode number和文件名通过hash关联起来了。同样,也有专门存储f2fs_dir_entry的块,叫做f2fs_dentry_block.

#define NR_DENTRY_IN_BLOCK 214 /* the number of dentry in a block */
#define SIZE_OF_DIR_ENTRY 11 /* by byte */
#define SIZE_OF_DENTRY_BITMAP ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
BITS_PER_BYTE)
#define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
F2FS_SLOT_LEN) * \
NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))
/* One directory entry slot representing F2FS_SLOT_LEN-sized file name */
struct f2fs_dir_entry {
__le32 hash_code; /* hash code of file name */
__le32 ino; /* inode number */
__le16 name_len; /* lengh of file name */
__u8 file_type; /* file type */
} __packed;
/* 4KB-sized directory entry block */
struct f2fs_dentry_block {
/* validity bitmap for directory entries in each block */
__u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
__u8 reserved[SIZE_OF_RESERVED];
struct f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK];
__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
} __packed;

这里我之前的一个顾虑是,如果出现不同file name的inode hash到同一个inode number,岂不是出问题了?后来通过看具体实现的代码,可以看到,实际还会加上file name的比较。这样就可以避免碰撞了。
那么,f2fs第一个inode节点(root indoe)的inode number是怎么确定的?又存储在哪呢?

f2fs super block

f2fs super block数据结构回答了上面的问题,同时也记录了NAT block的起始地址。主要的数据结构如下:

struct f2fs_super_block {
__le32 magic; /* Magic Number */
__le16 major_ver; /* Major Version */
__le16 minor_ver; /* Minor Version */
__le32 log_sectorsize; /* log2 sector size in bytes */
__le32 log_sectors_per_block; /* log2 # of sectors per block */
__le32 log_blocksize; /* log2 block size in bytes */
__le32 log_blocks_per_seg; /* log2 # of blocks per segment */
__le32 segs_per_sec; /* # of segments per section */
__le32 secs_per_zone; /* # of sections per zone */
__le32 checksum_offset; /* checksum offset inside super block */
__le64 block_count; /* total # of user blocks */
__le32 section_count; /* total # of sections */
__le32 segment_count; /* total # of segments */
__le32 segment_count_ckpt; /* # of segments for checkpoint */
__le32 segment_count_sit; /* # of segments for SIT */
__le32 segment_count_nat; /* # of segments for NAT */
__le32 segment_count_ssa; /* # of segments for SSA */
__le32 segment_count_main; /* # of segments for main area */
__le32 segment0_blkaddr; /* start block address of segment 0 */
__le32 cp_blkaddr; /* start block address of checkpoint */
__le32 sit_blkaddr; /* start block address of SIT */
__le32 nat_blkaddr; /* start block address of NAT */
__le32 ssa_blkaddr; /* start block address of SSA */
__le32 main_blkaddr; /* start block address of main area */
__le32 root_ino; /* root inode number */
__le32 node_ino; /* node inode number */
__le32 meta_ino; /* meta inode number */
__u8 uuid[16]; /* 128-bit uuid for volume */
__le16 volume_name[MAX_VOLUME_NAME]; /* volume name */
__le32 extension_count; /* # of extensions below */
__u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */
__le32 cp_payload;
__u8 version[VERSION_LEN]; /* the kernel version */
__u8 init_version[VERSION_LEN]; /* the initial kernel version */
__le32 feature; /* defined features */
__u8 encryption_level; /* versioning level for encryption */
__u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
struct f2fs_device devs[MAX_DEVICES]; /* device list */
__le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */
__u8 hot_ext_count; /* # of hot file extension */
__u8 reserved[314]; /* valid reserved region */
} __packed;

而super block的位置是固定的,当以f2fs格式化一个磁盘的时候,它会写入到磁盘固定偏移的地方。

SIT

由于f2fs是LFS,追加的写的大小不固定,很可能小于一个segment的大小,这就需要记录哪些block已经使用。segment info table 就是做这个事情的,里面的valid_map记录了有效的块。

/*
* Note that f2fs_sit_entry->vblocks has the following bit-field information.
* [15:10] : allocation type such as CURSEG_XXXX_TYPE
* [9:0] : valid block count
*/
#define SIT_VBLOCKS_SHIFT 10
#define SIT_VBLOCKS_MASK ((1 < < SIT_VBLOCKS_SHIFT) - 1)
#define GET_SIT_VBLOCKS(raw_sit) \
(le16_to_cpu((raw_sit)->vblocks) & SIT_VBLOCKS_MASK)
#define GET_SIT_TYPE(raw_sit) \
((le16_to_cpu((raw_sit)->vblocks) & ~SIT_VBLOCKS_MASK) \
>> SIT_VBLOCKS_SHIFT)
struct f2fs_sit_entry {
__le16 vblocks; /* reference above */
__u8 valid_map[SIT_VBLOCK_MAP_SIZE]; /* bitmap for valid blocks */
__le64 mtime; /* segment age for cleaning */
} __packed;
struct f2fs_sit_block {
struct f2fs_sit_entry entries[SIT_ENTRY_PER_BLOCK];
} __packed;

segment summary

f2fs 一个重要的设计特色就是避免了对传统LFS 的wandering tree问题,这个主要是通过segment summary 相关的数据结构实现的。
通过上面SIT的介绍,一次写之后,需要更新对应的SIT。这个更新会记录到 f2fs_sit_journal_entry中:

struct sit_journal_entry {
__le32 segno;
struct f2fs_sit_entry se;
} __packed;
struct sit_journal {
struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES];
__u8 reserved[SIT_JOURNAL_RESERVED];
} __packed;

如果新建一个文件或目录,并且有写操作,就需要更新nat 区域。同样对这个inode的更新也会记录到f2fs_nat_journal_entry中:

struct nat_journal_entry {
__le32 nid;
struct f2fs_nat_entry ne;
} __packed;
struct nat_journal {
struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES];
__u8 reserved[NAT_JOURNAL_RESERVED];
}

一个写操作,其实对NAT/SIT更新的区域的很小。如果每次都直接更新这两个区域,对SSD会导致比较大的写放大。为了避免这个问题,f2fs 通过segment summary把这些零星的写攒到segment summary 区域。

/*
* For segment summary
*
* One summary block contains exactly 512 summary entries, which represents
* exactly 2MB segment by default. Not allow to change the basic units.
*
* NOTE: For initializing fields, you must use set_summary
*
* - If data page, nid represents dnode's nid
* - If node page, nid represents the node page's nid.
*
* The ofs_in_node is used by only data page. It represents offset
* from node's page's beginning to get a data block address.
* ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
*/
#define ENTRIES_IN_SUM 512
#define SUMMARY_SIZE (7) /* sizeof(struct summary) */
#define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */
#define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM)
/* a summary entry for a 4KB-sized block in a segment */
struct f2fs_summary {
__le32 nid; /* parent node id */
union {
__u8 reserved[3];
struct {
__u8 version; /* node version number */
__le16 ofs_in_node; /* block index in parent node */
} __packed;
};
} __packed;
/* summary block type, node or data, is stored to the summary_footer */
#define SUM_TYPE_NODE (1)
#define SUM_TYPE_DATA (0)
struct summary_footer {
unsigned char entry_type; /* SUM_TYPE_XXX */
__le32 check_sum; /* summary checksum */
} __packed;
#define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
SUM_ENTRY_SIZE)
#define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
sizeof(struct nat_journal_entry))
#define NAT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
sizeof(struct nat_journal_entry))
#define SIT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
sizeof(struct sit_journal_entry))
#define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
sizeof(struct sit_journal_entry))
/* Reserved area should make size of f2fs_extra_info equals to
* that of nat_journal and sit_journal.
*/
#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8)
/*
* frequently updated NAT/SIT entries can be stored in the spare area in
* summary blocks
*/
enum {
NAT_JOURNAL = 0,
SIT_JOURNAL
};
struct f2fs_extra_info {
__le64 kbytes_written;
__u8 reserved[EXTRA_INFO_RESERVED];
} __packed;
struct f2fs_journal {
union {
__le16 n_nats;
__le16 n_sits;
};
/* spare area is used by NAT or SIT journals or extra info */
union {
struct nat_journal nat_j;
struct sit_journal sit_j;
struct f2fs_extra_info info;
};
} __packed;
/* 4KB-sized summary block structure */
struct f2fs_summary_block {
struct f2fs_summary entries[ENTRIES_IN_SUM]; // 512 entry * 6 bytes per entry ,used to recor where has modifiaction
struct f2fs_journal journal;
struct summary_footer footer;
} __packed;

file 相关操作

f2fs.h:
file_operations f2fs_file_operations:
fs/f2fs/file.c:

const struct file_operations f2fs_file_operations = {
.llseek = f2fs_llseek,
.read_iter = generic_file_read_iter,
.write_iter = f2fs_file_write_iter,
.open = f2fs_file_open,
.release = f2fs_release_file,
.mmap = f2fs_file_mmap,
.flush = f2fs_file_flush,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
.unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = f2fs_compat_ioctl,
#endif
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};

参考

include/linux/f2fs_fs.h
fs/f2fs/
Documentation/filesystems/f2fs.txt