IO FILE fread

fread

首先感谢一下ray-cp师傅的文章 https://www.anquanke.com/post/id/177958#h2-5

测试程序

#include<stdio.h>

int main(){
char data[20];
FILE*fp=fopen("./text","rb");
fread(data,1,20,fp);
return 0;
}
size_t fread ( void *buffer, size_t size, size_t count, FILE *stream) ;

libio/iofread.c

weak_alias(_IO_fread , fread);

_IO_fread

_IO_size_t
_IO_fread (void *buf, _IO_size_t size, _IO_size_t count, _IO_FILE *fp)
{
_IO_size_t bytes_requested = size * count;
_IO_size_t bytes_read;
CHECK_FILE (fp, 0);
if (bytes_requested == 0)
return 0;
_IO_acquire_lock (fp);
bytes_read = _IO_sgetn (fp, (char *) buf, bytes_requested);
_IO_release_lock (fp);
return bytes_requested == bytes_read ? count : bytes_read / size;
}
libc_hidden_def (_IO_fread)

首先CHECK_FILE检测fp合法性

#ifdef IO_DEBUG
# define CHECK_FILE(FILE, RET) \
if ((FILE) == NULL) { MAYBE_SET_EINVAL; return RET; } \
else { COERCE_FILE(FILE); \
if (((FILE)->_IO_file_flags & _IO_MAGIC_MASK) != _IO_MAGIC) \
{ MAYBE_SET_EINVAL; return RET; }}
#else
# define CHECK_FILE(FILE, RET) COERCE_FILE (FILE)
#endif

如果定义了IO_DEBUG

FILE为空直接返回,否则检测FILE的flag魔数是否匹配;如果没有定义IO_DEBUG,CHECK_FILE只会展开为COERCE_FILE (FILE),不做上述检查

回到_IO_fread进行加锁后调用_IO_sgetn

libio/genops.c

_IO_sgetn

_IO_size_t
_IO_sgetn (_IO_FILE *fp, void *data, _IO_size_t n)
{
/* FIXME handle putback buffer here! */
return _IO_XSGETN (fp, data, n);
}

libio/libioP.h

#define _IO_XSGETN(FP, DATA, N) JUMP2 (__xsgetn, FP, DATA, N)

#define JUMP2(FUNC, THIS, X1, X2) (_IO_JUMPS_FUNC(THIS)->FUNC) (THIS, X1, X2)

#define _IO_JUMPS_FUNC(THIS) _IO_JUMPS_FILE_plus (THIS)

#define _IO_JUMPS_FILE_plus(THIS) \
_IO_CAST_FIELD_ACCESS ((THIS), struct _IO_FILE_plus, vtable)
#define _IO_CAST_FIELD_ACCESS(THIS, TYPE, MEMBER) \
(*(_IO_MEMBER_TYPE (TYPE, MEMBER) *)(((char *) (THIS)) \
+ offsetof(TYPE, MEMBER)))
#define _IO_MEMBER_TYPE(TYPE, MEMBER) __typeof__ (((TYPE){}).MEMBER)

盖亚,fucking macro

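((struct _IO_FILE_plus){}) 是一个用空初始化列表写出的 _IO_FILE_plus 复合字面量。它只出现在 __typeof__ 里面,不会被真正求值,作用仅仅是取出 vtable 成员的类型。_IO_CAST_FIELD_ACCESS 再把 fp 转成 char * 并加上 offsetof(struct _IO_FILE_plus, vtable),按这个类型解引用,得到的就是 fp 对应的虚表指针。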

((*(__typeof__ (((struct _IO_FILE_plus){}).vtable) *)(((char *) ((fp))) + offsetof(struct _IO_FILE_plus, vtable)))->__xsgetn) (fp, data, n);

最终调用虚表里的__xsgetn指向的_IO_file_xsgetn


libio/fileops.c

_IO_file_xsgetn

_IO_size_t
_IO_file_xsgetn(_IO_FILE* fp, void* data, _IO_size_t n) {
_IO_size_t want, have;
_IO_ssize_t count;
char* s = data;

want = n;

if (fp->_IO_buf_base == NULL) {
/* Maybe we already have a push back pointer. */
if (fp->_IO_save_base != NULL) {
free(fp->_IO_save_base);
fp->_flags &= ~_IO_IN_BACKUP;
}
_IO_doallocbuf(fp);
}

while (want > 0) {
have = fp->_IO_read_end - fp->_IO_read_ptr;
if (want <= have) {
memcpy(s, fp->_IO_read_ptr, want);
fp->_IO_read_ptr += want;
want = 0;
}
else {
if (have > 0) {
#ifdef _LIBC
s = __mempcpy(s, fp->_IO_read_ptr, have);
#else
memcpy(s, fp->_IO_read_ptr, have);
s += have;
#endif
want -= have;
fp->_IO_read_ptr += have;
}

/* Check for backup and repeat */
if (_IO_in_backup(fp)) {
_IO_switch_to_main_get_area(fp);
continue;
}

/* If we now want less than a buffer, underflow and repeat
the copy. Otherwise, _IO_SYSREAD directly to
the user buffer. */
if (fp->_IO_buf_base
&& want < (size_t)(fp->_IO_buf_end - fp->_IO_buf_base)) {
if (__underflow(fp) == EOF)
break;

continue;
}

/* These must be set before the sysread as we might longjmp out
waiting for input. */
_IO_setg(fp, fp->_IO_buf_base, fp->_IO_buf_base, fp->_IO_buf_base);
_IO_setp(fp, fp->_IO_buf_base, fp->_IO_buf_base);

/* Try to maintain alignment: read a whole number of blocks. */
count = want;
if (fp->_IO_buf_base) {
_IO_size_t block_size = fp->_IO_buf_end - fp->_IO_buf_base;
if (block_size >= 128)
count -= want % block_size;
}

count = _IO_SYSREAD(fp, s, count);
if (count <= 0) {
if (count == 0)
fp->_flags |= _IO_EOF_SEEN;
else
fp->_flags |= _IO_ERR_SEEN;

break;
}

s += count;
want -= count;
if (fp->_offset != _IO_pos_BAD)
_IO_pos_adjust(fp->_offset, count);
}
}

return n - want;
}
libc_hidden_def(_IO_file_xsgetn)

初始化输入缓冲区

如果_IO_buf_base为空(尚未分配输入缓冲区),先检测是否有备份的缓冲区(_IO_save_base不为空),有则将其释放掉,并清除_IO_IN_BACKUP标志位
当程序需要从一个IO流中读取下一个字符时,它通常会从缓冲区中读取一个字符,如果缓冲区为空,则会触发一个IO操作,将更多的数据读入缓冲区中。但是在某些情况下,程序需要读取缓冲区中的备用字符,而不是从IO流中读取。此时,_IO_IN_BACKUP标志就会被设置,以指示下一个字符是备用字符而不是IO流中的字符。

然后调用_IO_doallocbuf(buf)

libio/genops.c

void
_IO_doallocbuf (_IO_FILE *fp)
{
if (fp->_IO_buf_base)
return;
if (!(fp->_flags & _IO_UNBUFFERED) || fp->_mode > 0)
if (_IO_DOALLOCATE (fp) != EOF)
return;
_IO_setb (fp, fp->_shortbuf, fp->_shortbuf+1, 0);
}
libc_hidden_def (_IO_doallocbuf)

如果_IO_buf_base已经存在则直接返回;否则当流不是无缓冲模式(未设置_IO_UNBUFFERED)或_mode > 0时调用_IO_DOALLOCATE,调用成功即返回;其余情况退回使用结构体自带的_shortbuf作为缓冲区

libio/libioP.h

#define  _IO_DOALLOCATE(FP)  JUMP0(__doallocate , FP)

根据前面的经验,这里调用的是虚表中**__doallocate**指向的**_IO_file_doallocate**函数

libio/filedoalloc.c

int
_IO_file_doallocate(_IO_FILE* fp) {
_IO_size_t size;
char* p;
struct stat64 st;

#ifndef _LIBC
/* If _IO_cleanup_registration_needed is non-zero, we should call the
function it points to. This is to make sure _IO_cleanup gets called
on exit. We call it from _IO_file_doallocate, since that is likely
to get called by any program that does buffered I/O. */
if (__glibc_unlikely(_IO_cleanup_registration_needed != NULL))
(*_IO_cleanup_registration_needed) ();
#endif

size = _IO_BUFSIZ;
if (fp->_fileno >= 0 && __builtin_expect(_IO_SYSSTAT(fp, &st), 0) >= 0) {
if (S_ISCHR(st.st_mode)) {
/* Possibly a tty. */
if (
#ifdef DEV_TTY_P
DEV_TTY_P(&st) ||
#endif
local_isatty(fp->_fileno))
fp->_flags |= _IO_LINE_BUF;
}
#if _IO_HAVE_ST_BLKSIZE
if (st.st_blksize > 0)
size = st.st_blksize;
#endif
}
p = malloc(size);
if (__glibc_unlikely(p == NULL))
return EOF;
_IO_setb(fp, p, p + size, 1);
return 1;
}
libc_hidden_def(_IO_file_doallocate)

struct stat64是一个在UNIX和Linux系统中用于存储文件或目录元数据的数据结构。它的定义通常在头文件<sys/stat.h>中,并包含以下成员(实际成员名都带有st_前缀,例如上面代码中用到的st_mode和st_blksize):

  1. dev:文件所在设备的设备号
  2. ino:文件的i节点号
  3. mode:文件的访问权限和类型(如普通文件、目录、符号链接等)
  4. nlink:文件的硬链接数
  5. uid:文件所有者的用户ID
  6. gid:文件所有者所在的组ID
  7. rdev:如果文件是设备文件,则为设备号
  8. size:文件大小(以字节为单位)
  9. blksize:文件系统块大小(以字节为单位)
  10. blocks:文件占用的块数(以文件系统块为单位)
  11. atime:文件上一次被访问的时间
  12. mtime:文件上一次修改的时间
  13. ctime:文件上一次状态改变的时间(如文件所有者或权限的改变)

首先是_IO_SYSSTAT调用虚表中的__stat指向的_IO_file_stat函数

#define _IO_SYSSTAT(FP, BUF) JUMP1 (__stat, FP, BUF)

通过系统调用SYS_fstat把文件信息填充到st中,其中st.st_blksize(大于0时)会被用作要申请的缓冲区大小size

image-20230331170443379

然后申请了一块内存,调用libio/genops.c中的**_IO_setb**

void
_IO_setb(_IO_FILE* f, char* b, char* eb, int a) {
if (f->_IO_buf_base && !(f->_flags & _IO_USER_BUF))
free(f->_IO_buf_base);
f->_IO_buf_base = b;
f->_IO_buf_end = eb;
if (a)
f->_flags &= ~_IO_USER_BUF;
else
f->_flags |= _IO_USER_BUF;
}
libc_hidden_def(_IO_setb)

如果f的_IO_buf_base存在,而且_flags中没有设置_IO_USER_BUF标志(即旧缓冲区不是用户分配的),就先释放这个旧缓冲区

把_IO_buf_base设置为刚刚malloc得到的堆块起始地址,_IO_buf_end设置为堆块起始地址+堆块大小;由于这里传入的参数a为1,会清除_flags中的_IO_USER_BUF标志,表示该缓冲区是由标准I/O库分配的,而不是用户分配的

#define _IO_USER_BUF 1 /* User owns buffer; don’t delete it on close. */

_IO_USER_BUF用于指示文件流的缓冲区是由用户分配的还是由标准 I/O 库分配的。如果 _IO_USER_BUF 标志位未被设置,那么表示文件流的缓冲区是由标准 I/O 库分配的。在这种情况下,标准 I/O 库会在关闭文件流时自动释放缓冲区。如果 _IO_USER_BUF 标志位被设置,那么表示文件流的缓冲区是由用户分配的。在这种情况下,标准 I/O 库不会尝试释放缓冲区


往缓冲区里写入流指针所标记的文件数据,并把数据写入目标内存

回到_IO_file_xsgetn,进入while (want > 0)循环。want的初值是要读取的总字节数n(即fread的size * count),循环会一直执行到want减为0,或者遇到EOF/读取错误而提前break

前面条件都不符合来到 if (fp->_IO_buf_base&& want < (size_t) (fp->_IO_buf_end - fp->_IO_buf_base))

want < (size_t) (fp->_IO_buf_end - fp->_IO_buf_base)时,即剩余要读取的数据小于缓冲区大小(这里是4k)

调用libio/genops.c __underflow

int
__underflow (_IO_FILE *fp)
{
#if defined _LIBC || defined _GLIBCPP_USE_WCHAR_T
if (_IO_vtable_offset (fp) == 0 && _IO_fwide (fp, -1) != -1)
return EOF;
#endif

if (fp->_mode == 0)
_IO_fwide (fp, -1);
if (_IO_in_put_mode (fp))
if (_IO_switch_to_get_mode (fp) == EOF)
return EOF;
if (fp->_IO_read_ptr < fp->_IO_read_end)
return *(unsigned char *) fp->_IO_read_ptr;
if (_IO_in_backup (fp))
{
_IO_switch_to_main_get_area (fp);
if (fp->_IO_read_ptr < fp->_IO_read_end)
return *(unsigned char *) fp->_IO_read_ptr;
}
if (_IO_have_markers (fp))
{
if (save_for_backup (fp, fp->_IO_read_end))
return EOF;
}
else if (_IO_have_backup (fp))
_IO_free_backup_area (fp);
return _IO_UNDERFLOW (fp);
}
libc_hidden_def (__underflow)

fp->_IO_read_ptr < fp->_IO_read_end检测缓冲区是否有数据,有数据可以直接返回读取

否则调用_IO_UNDERFLOW

#define _IO_UNDERFLOW(FP) JUMP0 (__underflow, FP)

即虚表中的__underflow字段,对应_IO_new_file_underflow函数

_IO_new_file_underflow (_IO_FILE *fp)
{
_IO_ssize_t count;
#if 0
/* SysV does not make this test; take it out for compatibility */
if (fp->_flags & _IO_EOF_SEEN)
return (EOF);
#endif

if (fp->_flags & _IO_NO_READS)//会先检测是否存在_IO_NO_READS标志,
{
fp->_flags |= _IO_ERR_SEEN;
__set_errno (EBADF);
return EOF;
}
if (fp->_IO_read_ptr < fp->_IO_read_end)//检测输入缓冲区里存在数据

return *(unsigned char *) fp->_IO_read_ptr;

if (fp->_IO_buf_base == NULL)
{
/* Maybe we already have a push back pointer. */
if (fp->_IO_save_base != NULL)
{
free (fp->_IO_save_base);
fp->_flags &= ~_IO_IN_BACKUP;
}
_IO_doallocbuf (fp);//如果没有输入缓冲区,则再次调用_IO_doallocbuf分配输入缓冲区
}

/* Flush all line buffered files before reading. */
/* FIXME This can/should be moved to genops ?? */
if (fp->_flags & (_IO_LINE_BUF|_IO_UNBUFFERED))
{
#if 0
_IO_flush_all_linebuffered ();
#else
/* We used to flush all line-buffered stream. This really isn't
required by any standard. My recollection is that
traditional Unix systems did this for stdout. stderr better
not be line buffered. So we do just that here
explicitly. --drepper */
_IO_acquire_lock (_IO_stdout);

if ((_IO_stdout->_flags & (_IO_LINKED | _IO_NO_WRITES | _IO_LINE_BUF))
== (_IO_LINKED | _IO_LINE_BUF))
_IO_OVERFLOW (_IO_stdout, EOF);

_IO_release_lock (_IO_stdout);
#endif
}

_IO_switch_to_get_mode (fp);
/* This is very tricky. We have to adjust those
pointers before we call _IO_SYSREAD () since
we may longjump () out while waiting for
input. Those pointers may be screwed up. H.J. */
fp->_IO_read_base = fp->_IO_read_ptr = fp->_IO_buf_base;
fp->_IO_read_end = fp->_IO_buf_base;
fp->_IO_write_base = fp->_IO_write_ptr = fp->_IO_write_end
= fp->_IO_buf_base;

count = _IO_SYSREAD (fp, fp->_IO_buf_base,
fp->_IO_buf_end - fp->_IO_buf_base);
if (count <= 0)
{
if (count == 0)
fp->_flags |= _IO_EOF_SEEN;
else
fp->_flags |= _IO_ERR_SEEN, count = 0;
}
fp->_IO_read_end += count;
if (count == 0)
{
/* If a stream is read to EOF, the calling application may switch active
handles. As a result, our offset cache would no longer be valid, so
unset it. */
fp->_offset = _IO_pos_BAD;
return EOF;
}
if (fp->_offset != _IO_pos_BAD)
_IO_pos_adjust (fp->_offset, count);
return *(unsigned char *) fp->_IO_read_ptr;
}

会先检测是否存在_IO_NO_READS标志,存在则设置错误标志并返回EOF;再检测输入缓冲区里是否还有数据,有则直接返回

如果没有输入缓冲区,则再次调用_IO_doallocbuf分配输入缓冲区

设置_IO_read_base,_IO_read_ptr,_IO_read_end,_IO_write_base,_IO_write_ptr,_IO_write_end都指向_IO_buf_base,即filedoalloc.c中申请的堆缓冲区的起始地址

image-20230331225339043

调用**_IO_SYSREAD**函数,尝试从fp中读取**_IO_buf_end - _IO_buf_base**字节的数据到_IO_buf_base

#define _IO_SYSREAD(FP, DATA, LEN) JUMP2 (__read, FP, DATA, LEN)

_IO_SYSREAD 调用虚表里的__read指向的libio/fileops.c的_IO_file_read

_IO_file_read (_IO_FILE *fp, void *buf, _IO_ssize_t size)
{
return (__builtin_expect (fp->_flags2 & _IO_FLAGS2_NOTCANCEL, 0)
? read_not_cancel (fp->_fileno, buf, size)
: read (fp->_fileno, buf, size));
}
libc_hidden_def (_IO_file_read)

image-20230331230514508

把文件描述符_fileno代表的文件数据读入到_IO_buf_base

image-20230331230607785

移动_IO_read_end位置,来标记读入数据的终点

image-20230331231456814

再次回到_IO_file_xsgetn的while循环

这次会进入第一条分支

image-20230331232055813

通过memcpy把缓冲区中的数据拷贝到fread函数的第一个参数buffer

image-20230331232240158

改变_IO_read_ptr指向区域

image-20230331232540181

一直循环到want为0

want >= (size_t) (fp->_IO_buf_end - fp->_IO_buf_base)时,即剩余要读取的数据大于等于缓冲区大小(4k)

_IO_setg (fp, fp->_IO_buf_base, fp->_IO_buf_base, fp->_IO_buf_base);
_IO_setp (fp, fp->_IO_buf_base, fp->_IO_buf_base);

/* Try to maintain alignment: read a whole number of blocks. */
count = want;
if (fp->_IO_buf_base)
{
_IO_size_t block_size = fp->_IO_buf_end - fp->_IO_buf_base;
if (block_size >= 128)
count -= want % block_size;
}

count = _IO_SYSREAD (fp, s, count);
if (count <= 0)
{
if (count == 0)
fp->_flags |= _IO_EOF_SEEN;
else
fp->_flags |= _IO_ERR_SEEN;

break;
}

s += count;
want -= count;
if (fp->_offset != _IO_pos_BAD)
_IO_pos_adjust (fp->_offset, count);

libio/libioP.h

#define _IO_setg(fp, eb, g, eg)  ((fp)->_IO_read_base = (eb),\
(fp)->_IO_read_ptr = (g), (fp)->_IO_read_end = (eg))
#define _IO_setp(__fp, __p, __ep) \
((__fp)->_IO_write_base = (__fp)->_IO_write_ptr \
= __p, (__fp)->_IO_write_end = (__ep))

macro展开是这样的

((fp)->_IO_read_base = (fp->_IO_buf_base), (fp)->_IO_read_ptr = (fp->_IO_buf_base), (fp)->_IO_read_end = (fp->_IO_buf_base));
((fp)->_IO_write_base = (fp)->_IO_write_ptr = fp->_IO_buf_base, (fp)->_IO_write_end = (fp->_IO_buf_base));

也是把结构体里read/write相关的6个指针均初始化为**_IO_buf_base**

因为此时申请的缓冲区大小为一个页面4k,所以block_size为4k,count会被向下对齐为block_size的整数倍(count -= want % block_size)。随后调用_IO_SYSREAD,这一次不经过缓冲区,直接把数据读入fread的第一个参数buffer指向的内存里。
执行s += count; want -= count;之后,如果剩余的want小于4k,就会和上面小于4k的情况一样,先读入缓冲区,再拷贝到第一个参数指向的内存里

函数中调用的vtable函数

  • _IO_sgetn函数调用了vtable的_IO_file_xsgetn
  • _IO_doallocbuf函数调用了vtable的_IO_file_doallocate以初始化输入缓冲区。
  • vtable中的_IO_file_doallocate调用了vtable中的__GI__IO_file_stat以获取文件信息。
  • __underflow函数调用了vtable中的_IO_new_file_underflow实现文件数据读取。
  • vtable中的_IO_new_file_underflow调用了vtable中的__GI__IO_file_read,最终去执行系统调用read。