IO_FILE

最初学习c语言的时候,我们经常会用fopen打开一些文件来操作

FILE *fopen(const char *filename, const char *mode)

fopen 返回一个 FILE 文件指针。这个 FILE 到底是什么呢?它的底层数据结构是什么样子?都有什么用呢?我们经常用的 io 函数又和它有什么关系呢?

What is FILE??

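看一下源码(以 glibc 2.23~2.27 为例):FILE 的 typedef 在 libio/stdio.h,struct _IO_FILE 和 struct _IO_wide_data 在 libio/libio.h,后文的 _IO_FILE_plus 和 _IO_jump_t 在 libio/libioP.h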

typedef struct _IO_FILE FILE;

_IO_FILE

/* Per-stream state behind the public FILE typedef: the flag word, the
   get/put/reserve buffer pointers, the list linkage and the descriptor. */
struct _IO_FILE {
int _flags; /* High-order word is _IO_MAGIC; rest is flags. */
#define _IO_file_flags _flags

/* The following pointers correspond to the C++ streambuf protocol. */
/* Note: Tk uses the _IO_read_ptr and _IO_read_end fields directly. */
char* _IO_read_ptr; /* Current read pointer */
char* _IO_read_end; /* End of get area. */
char* _IO_read_base; /* Start of putback+get area. */
char* _IO_write_base; /* Start of put area. */
char* _IO_write_ptr; /* Current put pointer. */
char* _IO_write_end; /* End of put area. */
char* _IO_buf_base; /* Start of reserve area. */
char* _IO_buf_end; /* End of reserve area. */
/* The following fields are used to support backing up and undo. */
char *_IO_save_base; /* Pointer to start of non-current get area. */
char *_IO_backup_base; /* Pointer to first valid character of backup area */
char *_IO_save_end; /* Pointer to end of non-current get area. */

struct _IO_marker *_markers;

/* Link to the next stream; the streams of a process form a singly
   linked list through this field. */
struct _IO_FILE *_chain;

/* Descriptor of the underlying file (the value handed to close(), see
   _IO_DELETE_DONT_CLOSE). */
int _fileno;
/* _blksize is compiled out; _flags2 occupies its slot instead. */
#if 0
int _blksize;
#else
int _flags2;
#endif
_IO_off_t _old_offset; /* This used to be _offset but it's too small. */

#define __HAVE_COLUMN /* temporary */
/* 1+column number of pbase(); 0 is unknown. */
unsigned short _cur_column;
signed char _vtable_offset;
char _shortbuf[1];

/* char* _save_gptr; char* _save_egptr; */

_IO_lock_t *_lock;
/* When _IO_USE_OLD_IO_FILE is defined, struct _IO_FILE ends right here and
   the fields below belong to struct _IO_FILE_complete, which embeds the old
   layout as its first member.  Otherwise they are simply the tail of
   struct _IO_FILE. */
#ifdef _IO_USE_OLD_IO_FILE
};

struct _IO_FILE_complete
{
struct _IO_FILE _file;
#endif
#if defined _G_IO_IO_FILE_VERSION && _G_IO_IO_FILE_VERSION == 0x20001
_IO_off64_t _offset;
# if defined _LIBC || defined _GLIBCPP_USE_WCHAR_T
/* Wide character stream stuff. */
struct _IO_codecvt *_codecvt;
struct _IO_wide_data *_wide_data;
struct _IO_FILE *_freeres_list;
void *_freeres_buf;
# else
/* Four placeholder pointers that keep the layout the same size as the
   wide-character branch above. */
void *__pad1;
void *__pad2;
void *__pad3;
void *__pad4;
# endif
size_t __pad5;
/* NOTE(review): in glibc this appears to record the stream orientation
   (0 = not yet set, <0 = byte, >0 = wide; see fwide) -- confirm against
   the glibc version being studied. */
int _mode;
/* Make sure we don't get into trouble again. */
/* Sized as 15 ints minus the four pointers and the size_t above. */
char _unused2[15 * sizeof (int) - 4 * sizeof (void *) - sizeof (size_t)];
#endif
};

为宽字符准备的结构体 defined _GLIBCPP_USE_WCHAR_T

/* Wide-character buffer state of a stream: the same set of
   read/write/reserve/backup pointers that struct _IO_FILE keeps for
   char data, here typed as wchar_t, plus conversion state and a jump
   table of its own. */
struct _IO_wide_data
{
wchar_t *_IO_read_ptr; /* Current read pointer */
wchar_t *_IO_read_end; /* End of get area. */
wchar_t *_IO_read_base; /* Start of putback+get area. */
wchar_t *_IO_write_base; /* Start of put area. */
wchar_t *_IO_write_ptr; /* Current put pointer. */
wchar_t *_IO_write_end; /* End of put area. */
wchar_t *_IO_buf_base; /* Start of reserve area. */
wchar_t *_IO_buf_end; /* End of reserve area. */
/* The following fields are used to support backing up and undo. */
wchar_t *_IO_save_base; /* Pointer to start of non-current get area. */
wchar_t *_IO_backup_base; /* Pointer to first valid character of
backup area */
wchar_t *_IO_save_end; /* Pointer to end of non-current get area. */
/* Presumably the multibyte<->wide conversion states (current and
   previous) -- TODO confirm against the glibc sources. */
__mbstate_t _IO_state;
__mbstate_t _IO_last_state;
/* Embedded by value here, whereas struct _IO_FILE only holds a pointer
   to a struct _IO_codecvt. */
struct _IO_codecvt _codecvt;
wchar_t _shortbuf[1];
/* Jump table (struct _IO_jump_t) used for the wide-character side. */
const struct _IO_jump_t *_wide_vtable;
};

_flags字段

/* Magic numbers and bits for the _flags field.
The magic numbers use the high-order bits of _flags;
the remaining bits are available for variable flags.
Note: The magic numbers must all be negative if stdio
emulation is desired. */

/* Bit layout of _flags: the bits selected by _IO_MAGIC_MASK (the upper
   16) carry the magic number, the lower 16 carry the single-bit flags
   listed below. */
#define _IO_MAGIC 0xFBAD0000 /* Magic number */
#define _OLD_STDIO_MAGIC 0xFABC0000 /* Emulate old stdio. */
#define _IO_MAGIC_MASK 0xFFFF0000
#define _IO_USER_BUF 1 /* User owns buffer; don't delete it on close. */
#define _IO_UNBUFFERED 2
#define _IO_NO_READS 4 /* Reading not allowed */
#define _IO_NO_WRITES 8 /* Writing not allowed */
#define _IO_EOF_SEEN 0x10
#define _IO_ERR_SEEN 0x20
#define _IO_DELETE_DONT_CLOSE 0x40 /* Don't call close(_fileno) on cleanup. */
#define _IO_LINKED 0x80 /* Set if linked (using _chain) to streambuf::_list_all.*/
#define _IO_IN_BACKUP 0x100
#define _IO_LINE_BUF 0x200
#define _IO_TIED_PUT_GET 0x400 /* Set if put and get pointer logically tied. */
#define _IO_CURRENTLY_PUTTING 0x800
#define _IO_IS_APPENDING 0x1000
#define _IO_IS_FILEBUF 0x2000
#define _IO_BAD_SEEN 0x4000
#define _IO_USER_LOCK 0x8000

_flags 的高两个字节(高 16 位)是 libc 固定的 Magic number(即 _IO_MAGIC,0xFBAD0000)

低两个字节(低 16 位)中的每一位是一个独立的标志位,这些标志位共同记录了该文件流当前的状态

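_mode 字段

注意:在 glibc 中,_mode 记录的是流的定向(orientation):0 表示尚未确定,负值表示字节流,正值表示宽字符流(可以用 fwide 查询或设置)。下面列表中的 _IOFBF、_IOLBF、_IONBF 是传给 setvbuf 的缓冲模式参数,并不是 _mode 的取值;其余各项也不是 glibc 为 _mode 定义的值,仅供参考。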

  • _IOFBF(0):文件流是全缓冲模式。
  • _IOLBF(1):文件流是行缓冲模式。
  • _IONBF(2):文件流是无缓冲模式。
  • _IOREADING(3):文件流当前正在进行读操作。
  • _IOWRITING(4):文件流当前正在进行写操作。
  • _IOAPPEND(8):文件流是以追加模式打开的。
  • _IOEOF(16):文件流已经到达了文件的末尾。
  • _IOERR(32):文件流出现了错误。
  • _IOSTRG(64):文件流是一个字符串流。
  • _IORW(128):文件流是读写模式。

为什么要有输入输出缓冲区?

主要原因是为了提高程序的性能。输入输出操作通常比内存操作慢得多,因为它们涉及到从硬盘、网络等设备读取或写入数据。使用缓冲区可以减少实际的IO操作次数,从而提高程序的效率。

考虑到 IO 性能和 CPU 性能的差距,库会先把数据缓存在一段 buffer 中,当这段 buffer 写满或者被外部触发时,才真正执行写入或者读出操作。

FILE 在 Linux 系统的标准 IO 库中是用于描述文件的结构,称为文件流。

FILE 结构在程序执行 fopen 等函数时会进行创建,并分配在堆中。

在标准 I/O 库中,每个程序启动时有三个文件流是自动打开的:stdin、stdout、stderr。它们对应的结构体位于 libc.so 的数据段中,分别是:

_IO_2_1_stderr_
_IO_2_1_stdout_
_IO_2_1_stdin_

进程里用_IO_FILE._chain构成一个单向链表,链表头部在libc全局变量_IO_list_all

_IO_list_all_stamp 是 libc 内部的一个计数器(并不是真正的时间戳):每当 _IO_list_all 链表被修改(有流被链入或摘除)时它就会递增,遍历链表的代码通过比较它的值来判断链表在遍历期间是否发生了变化。

extern struct _IO_FILE_plus *_IO_list_all;

初始时单链表是这样的

_IO_list_all-->_IO_2_1_stderr_-->_IO_2_1_stdout_-->_IO_2_1_stdin_

_IO_FILE_plus

image-20230329194516352

可以用p *(struct _IO_FILE_plus *) addr来打印

可以看出实际应用中我们使用时是 _IO_FILE_plus结构体

/* A stream object as libio actually allocates it: the _IO_FILE itself,
   immediately followed by a pointer to its jump table. */
struct _IO_FILE_plus
{
_IO_FILE file;
/* Placed directly after `file`, so it sits at offset 0x94 on 32-bit
   and 0xd8 on 64-bit. */
const struct _IO_jump_t *vtable;
};

偏移

0x0   _flags
0x8 _IO_read_ptr
0x10 _IO_read_end
0x18 _IO_read_base
0x20 _IO_write_base
0x28 _IO_write_ptr
0x30 _IO_write_end
0x38 _IO_buf_base
0x40 _IO_buf_end
0x48 _IO_save_base
0x50 _IO_backup_base
0x58 _IO_save_end
0x60 _markers
0x68 _chain
0x70 _fileno
0x74 _flags2
0x78 _old_offset
0x80 _cur_column
0x82 _vtable_offset
0x83 _shortbuf
0x88 _lock
0x90 _offset
0x98 _codecvt
0xa0 _wide_data
0xa8 _freeres_list
0xb0 _freeres_buf
0xb8 __pad5
0xc0 _mode
0xc4 _unused2
0xd8 vtable

IO_jump_t * vtable;虚表,一下子联想到c++虚函数底层原理

_IO_jump_t中保存了一些可以跳转的函数指针,标准 IO 函数需要文件流指针指引去调用虚表函数

/* Jump table of a stream: one function-pointer slot per stream
   operation.  Every slot is declared through JUMP_FIELD(type, name);
   that macro is defined outside this excerpt. */
struct _IO_jump_t
{
/* Two size_t placeholder slots precede the real entries. */
JUMP_FIELD(size_t, __dummy);
JUMP_FIELD(size_t, __dummy2);
JUMP_FIELD(_IO_finish_t, __finish);
JUMP_FIELD(_IO_overflow_t, __overflow);
JUMP_FIELD(_IO_underflow_t, __underflow);
/* __uflow is declared with the same _IO_underflow_t type as __underflow. */
JUMP_FIELD(_IO_underflow_t, __uflow);
JUMP_FIELD(_IO_pbackfail_t, __pbackfail);
/* showmany */
JUMP_FIELD(_IO_xsputn_t, __xsputn);
JUMP_FIELD(_IO_xsgetn_t, __xsgetn);
JUMP_FIELD(_IO_seekoff_t, __seekoff);
JUMP_FIELD(_IO_seekpos_t, __seekpos);
JUMP_FIELD(_IO_setbuf_t, __setbuf);
JUMP_FIELD(_IO_sync_t, __sync);
JUMP_FIELD(_IO_doallocate_t, __doallocate);
JUMP_FIELD(_IO_read_t, __read);
JUMP_FIELD(_IO_write_t, __write);
JUMP_FIELD(_IO_seek_t, __seek);
JUMP_FIELD(_IO_close_t, __close);
JUMP_FIELD(_IO_stat_t, __stat);
JUMP_FIELD(_IO_showmanyc_t, __showmanyc);
JUMP_FIELD(_IO_imbue_t, __imbue);
/* Compiled out: these two entries are not part of the table. */
#if 0
get_column;
set_column;
#endif
};
image-20230329195951585

fp 是用 fopen 打开的一个文件,可以看到它实际上已经作为 _IO_FILE_plus 加入了链表;这里 stdin、stdout、stderr 和 fp 的 vtable 都指向 _IO_jump_t 类型的结构体 _IO_file_jumps

image-20230329200421200

printf/puts 最终会调用_IO_file_xsputn

fclose 最终会通过 _IO_FINISH 宏调用 _IO_file_finish

fwrite 最终会调用_IO_file_xsputn

fread 最终会调用_IO_file_xsgetn

scanf/gets 最终会调用_IO_file_xsgetn

先不做源码分析,先单步看一下printf函数

#include <stdio.h>

/* Smallest possible program for single-stepping through printf. */
int main(void)
{
    /* Where the dream began (sob). The string has no trailing newline. */
    printf("hello world");
    return 0;
}

image-20230330001602270

_IO_file_xsputn

image-20230329233744967

_IO_file_overflow

image-20230329233840601

_IO_doallocbuf

image-20230329233956755

_IO_file_doallocate

image-20230329234222565

_IO_file_stat

image-20230329234850989

__fxstat64

image-20230329235004997

malloc@plt

image-20230329235217971

_IO_setb

image-20230329235450171

_IO_do_write

image-20230329235615761

_IO_default_xsputn

image-20230329235927623

_IO_file_overflow

image-20230330000048279

下面会循环调用 _IO_file_overflow,每次调用都往我们 malloc 出来的堆块(缓冲区)里写入一个字节的输出内容

image-20230330001359527

最后会调用我们的write做系统调用输出

image-20230330001235259

WTF