在 C 中从文件获取 char* 的最快方法
Fastest way to get char* from file in C
我有一个函数,它接收两个参数:char* filedata(用于存储整个文件的内容)和 FILE *fp(已打开的文件)。
// Appends the contents of fp, one fgets() chunk at a time, to the string
// already stored in filedata, then closes fp.
//
// filedata: must hold a NUL-terminated string on entry (strlen() is applied
//           to it) and must be large enough for everything appended; the
//           function has no way to check its capacity.
// fp:       open stream to read from; it is closed before returning.
//
// Each iteration rebuilds the whole accumulated text in a scratch buffer, so
// the cost grows with the square of the number of lines. That approach is
// kept as posted because it is what the question asks about.
void read_file(char *filedata, FILE *fp){
    char buffer[1000];
    while(fgets(buffer, sizeof(buffer), fp))
    {
        char *new_str;
        if((new_str = malloc(strlen(filedata) + strlen(buffer)+1)) != NULL)
        {
            // Was a mangled '[=10=]' literal; strcat() needs a real terminator
            // to find the end of the (initially empty) destination.
            new_str[0] = '\0'; // ensures the memory is an empty string
            strcat(new_str, filedata);
            strcat(new_str, buffer);
        }
        else
        {
            printf("malloc failed!\n");
            break; // nothing to copy back; filedata keeps what it already holds
        }
        strcpy(filedata, new_str);
        free(new_str); // scratch copy is no longer needed once copied back
    }
    fclose(fp);
}
但这并不太快...有没有更快的方法来读取整个文件?
需要注意的是,您可以使用 fread() 函数,
将整个文件一次性读入一个大小合适的缓冲区。
以下代码概述了如何打开文件、确定其大小、分配该大小的缓冲区,然后将文件的数据(全部)读入该缓冲区。但请注意有关 fseek
和 ftell
函数的注意事项(稍后讨论):
#include <stdio.h>
#include <stdlib.h>
// Demonstrates reading a whole file with one fread(): open it in binary mode,
// seek to the end to learn its size, allocate size + 1 zeroed bytes, rewind
// and read. Returns 0 on success and 1 if any step fails.
int main(void)
{
    const char* filename = "MyFile.txt"; // Or whatever
    FILE* fp = fopen(filename, "rb"); // Open in binary mode
    if (fp == NULL) { // Passing a null stream to fseek() is undefined behaviour
        perror(filename);
        return 1;
    }
    int seek = fseek(fp, 0, SEEK_END); // CAVEAT: Files in BINARY mode may not support SEEK_END ...
    if (seek != 0) {
        printf("Cannot fseek on binary file!\n");
        fclose(fp);
        return 1;
    }
    long endpos = ftell(fp); // ... but this is not reliable if opened in TEXT mode!
    if (endpos < 0) { // ftell() signals failure with -1; as a size_t that would be a huge size
        perror("ftell");
        fclose(fp);
        return 1;
    }
    size_t filesize = (size_t)endpos;
    char* filedata = calloc(filesize + 1, 1); // Add 1 for the terminating "nul" character
    if (filedata == NULL) {
        printf("Cannot allocate %zu bytes!\n", filesize + 1);
        fclose(fp);
        return 1;
    }
    rewind(fp);
    size_t nread = fread(filedata, 1, filesize, fp); // Read whole file
    int status = 0;
    if (nread != filesize) { // Short read: the buffer does not hold the whole file
        printf("Read only %zu of %zu bytes!\n", nread, filesize);
        status = 1;
    }
    // Clean up ...
    fclose(fp);
    free(filedata);
    return status;
}
注意事项:
请注意,以二进制模式打开的文件(如我在 fopen() 调用中给出的 "rb" 模式参数)并不保证支持在 fseek() 调用中使用 SEEK_END 作为起点(origin);如果您的平台属于这种情况,原文此处链接的另一个回答提供了若干确定文件大小的替代方法。引自 cppreference:
… Binary streams are not required to support SEEK_END, in
particular if additional null bytes are output.
然而,另一方面,以文本模式打开文件(使用 "rt")会使 ftell 的返回值实际上毫无意义——无论是用来确定输入缓冲区所需的大小,还是作为传给 fread 的长度;引自 cppreference:
If the stream is open in text mode, the value returned by this
function is unspecified and is only meaningful as the input to
fseek().
另请注意,正如评论中所指出的,如果文件大小超过 long int 变量所能存储的最大值,fseek() 和 ftell() 函数将会失败;要处理此类情况,您可以使用(依赖于平台的)64 位等效函数,如我在另一个回答中所述。
下面给您的函数加上注释(暂不讨论内存泄漏等问题),并统计对字符缓冲区的各项操作:
// Annotated copy of the question's read_file(): each "<<--" marker names how
// many characters that call has to walk over. OLD_SIZE is the text already in
// filedata, NEW_SIZE is the chunk just read by fgets().
//
// filedata: NUL-terminated string on entry, with room for everything appended.
// fp:       open stream to read from; closed before returning.
//
// NOTE(review): only the mangled '[=10=]' literal is repaired here (it must be
// '\0', otherwise strcat() starts from an unterminated buffer). new_str is
// still never freed and strcpy() still runs after a failed malloc(); those are
// left untouched because the annotations deliberately set them aside.
void read_file(char *filedata, FILE *fp){
    char buffer[1000];
    while(fgets(buffer, sizeof(buffer), fp))            // <<-- NEW_SIZE
    {
        char *new_str;
        if((new_str = malloc(strlen(filedata)           // <<-- OLD_SIZE
                             + strlen(buffer)           // <<-- NEW_SIZE
                             +1)) != NULL)
        {
            new_str[0] = '\0'; // ensures the memory is an empty string
            strcat(new_str, filedata);                  // <<-- OLD_SIZE
            strcat(new_str, buffer);                    // <<-- OLD_SIZE + NEW_SIZE
        }
        else
        {
            printf("malloc failed!\n");
        }
        strcpy(filedata, new_str);                      // <<-- OLD_SIZE + NEW_SIZE
    }
    fclose(fp);
}
fgets()、strlen()、strcat() 和 strcpy() 都需要遍历一个字符缓冲区。
其中只有 fgets() 是真正必需的,其余的复制操作都可以避免。
把遍历缓冲区的次数加起来:
每次循环的操作总和:4 * OLD_SIZE + 4 * NEW_SIZE
另外请记住,OLD_SIZE 实际上是此前所有 NEW_SIZE 的累加和 SUM(NEW_SIZE),因此您的函数的开销
相对于循环迭代次数(基本上就是读取的行数)呈二次方(QUADRATIC)增长。
所以你最终得到:
Number of times a character is inspected
= 4 * N_LINE * LINE_SIZE
+ 2 * (N_LINE * (N_LINE-1) ) * LINE_SIZE
;
这意味着对于一个 100 行的文件,总开销大约相当于把一整行遍历 20K 次(4*100 + 2*100*99 = 20200 个 LINE_SIZE)。
[这是“画家施莱米尔”的故事]
下面是我的函数,说明了我通常是如何做的。不确定与所有其他可能的 C 实现相比它有多快。但我认为它们都非常相似,除非以某种方式编程不当,这可能会导致执行速度变慢、效率降低。
/* ==========================================================================
 * Function:    readfile ( FILE *ptr, int *nbytes )
 * Purpose:     read open file ptr into internal buffer
 * --------------------------------------------------------------------------
 * Arguments:   ptr (I)     FILE * to already open (via fopen)
 *                          file, whose contents are to be read
 *              nbytes (O)  int * returning #bytes in returned buffer
 *                          (may be NULL if the count is not wanted)
 * --------------------------------------------------------------------------
 * Returns:     ( unsigned char * ) buffer with ptr's contents,
 *              or NULL if ptr is NULL or memory could not be allocated
 * --------------------------------------------------------------------------
 * Notes:     o caller should free() returned output buffer ptr when finished
 *            o a non-NULL ptr is always fclose()'d before returning,
 *              including when allocation fails
 *            o when NULL is returned, *nbytes is set to 0
 *            o the buffer holds raw bytes; no terminating '\0' is appended
 *            o a read error ends the loop the same way as end-of-file
 *            o sizes are kept in int, so inputs approaching INT_MAX bytes
 *              are not supported
 * ======================================================================= */
/* --- entry point --- */
unsigned char *readfile ( FILE *ptr, int *nbytes ) {
  /* ---
   * allocations and declarations
   * ------------------------------- */
  unsigned char *outbuff = NULL;        /* realloc'ed below as data arrives */
  int   allocsz = 0,                    /* total #bytes allocated so far */
        reallocsz = 500000,             /* #bytes added per reallocation */
        blksz = 9900,                   /* #bytes requested per fread() */
        nread = 0,                      /* #bytes the last fread() returned */
        buffsz = 0;                     /* total #bytes stored in outbuff */
  /* ---
   * collect all bytes from ptr
   * ----------------------------- */
  if ( ptr != NULL ) {                  /* return NULL error if no input */
    while ( 1 ) {                       /* read all input from file */
      if ( buffsz+blksz + 99 >= allocsz ) { /* next block might not fit */
        unsigned char *grown = NULL;    /* keeps outbuff valid on failure */
        allocsz += reallocsz;           /* add reallocsz to allocation */
        grown = realloc(outbuff,allocsz);
        if ( grown == NULL ) {          /* out of memory */
          free(outbuff);                /* realloc left the old block alive */
          outbuff = NULL;               /* report failure to the caller */
          buffsz = 0;                   /* no bytes are being returned */
          break; }                      /* still fall through to fclose() */
        outbuff = grown; }              /* adopt the enlarged buffer */
      nread = (int)fread(outbuff+buffsz,1,blksz,ptr); /* read next block */
      if ( nread < 1 ) break;           /* all done, nothing left to read */
      buffsz += nread;                  /* add #bytes from current block */
      } /* --- end-of-while(1) --- */
    fclose(ptr);                        /* close fopen()'ed file ptr */
    } /* --- end-of-if(ptr!=NULL) --- */
  if ( nbytes != NULL ) *nbytes = buffsz; /* #bytes in outbuff */
  return ( outbuff );                   /* back to caller with output or NULL*/
  } /* --- end-of-function readfile() --- */
我有一个函数,它接收两个参数:char* filedata(用于存储整个文件的内容)和 FILE *fp(已打开的文件)。
// Appends the contents of fp, one fgets() chunk at a time, to the string
// already stored in filedata, then closes fp.
//
// filedata: must hold a NUL-terminated string on entry (strlen() is applied
//           to it) and must be large enough for everything appended; the
//           function has no way to check its capacity.
// fp:       open stream to read from; it is closed before returning.
//
// Each iteration rebuilds the whole accumulated text in a scratch buffer, so
// the cost grows with the square of the number of lines. That approach is
// kept as posted because it is what the question asks about.
void read_file(char *filedata, FILE *fp){
    char buffer[1000];
    while(fgets(buffer, sizeof(buffer), fp))
    {
        char *new_str;
        if((new_str = malloc(strlen(filedata) + strlen(buffer)+1)) != NULL)
        {
            // Was a mangled '[=10=]' literal; strcat() needs a real terminator
            // to find the end of the (initially empty) destination.
            new_str[0] = '\0'; // ensures the memory is an empty string
            strcat(new_str, filedata);
            strcat(new_str, buffer);
        }
        else
        {
            printf("malloc failed!\n");
            break; // nothing to copy back; filedata keeps what it already holds
        }
        strcpy(filedata, new_str);
        free(new_str); // scratch copy is no longer needed once copied back
    }
    fclose(fp);
}
但这并不太快...有没有更快的方法来读取整个文件?
需要注意的是,您可以使用 fread() 函数。
以下代码概述了如何打开文件、确定其大小、分配该大小的缓冲区,然后将文件的数据(全部)读入该缓冲区。但请注意有关 fseek
和 ftell
函数的注意事项(稍后讨论):
#include <stdio.h>
#include <stdlib.h>
// Demonstrates reading a whole file with one fread(): open it in binary mode,
// seek to the end to learn its size, allocate size + 1 zeroed bytes, rewind
// and read. Returns 0 on success and 1 if any step fails.
int main(void)
{
    const char* filename = "MyFile.txt"; // Or whatever
    FILE* fp = fopen(filename, "rb"); // Open in binary mode
    if (fp == NULL) { // Passing a null stream to fseek() is undefined behaviour
        perror(filename);
        return 1;
    }
    int seek = fseek(fp, 0, SEEK_END); // CAVEAT: Files in BINARY mode may not support SEEK_END ...
    if (seek != 0) {
        printf("Cannot fseek on binary file!\n");
        fclose(fp);
        return 1;
    }
    long endpos = ftell(fp); // ... but this is not reliable if opened in TEXT mode!
    if (endpos < 0) { // ftell() signals failure with -1; as a size_t that would be a huge size
        perror("ftell");
        fclose(fp);
        return 1;
    }
    size_t filesize = (size_t)endpos;
    char* filedata = calloc(filesize + 1, 1); // Add 1 for the terminating "nul" character
    if (filedata == NULL) {
        printf("Cannot allocate %zu bytes!\n", filesize + 1);
        fclose(fp);
        return 1;
    }
    rewind(fp);
    size_t nread = fread(filedata, 1, filesize, fp); // Read whole file
    int status = 0;
    if (nread != filesize) { // Short read: the buffer does not hold the whole file
        printf("Read only %zu of %zu bytes!\n", nread, filesize);
        status = 1;
    }
    // Clean up ...
    fclose(fp);
    free(filedata);
    return status;
}
注意事项:
请注意,以二进制模式打开的文件(如我在 fopen() 调用中给出的 "rb" 模式参数)并不保证支持在 fseek() 调用中使用 SEEK_END 作为起点(origin);如果您的平台属于这种情况,原文此处链接的另一个回答提供了若干确定文件大小的替代方法。引自 cppreference:
… Binary streams are not required to support SEEK_END, in particular if additional null bytes are output.
然而,另一方面,以文本模式打开文件(使用 "rt")会使 ftell 的返回值实际上毫无意义——无论是用来确定输入缓冲区所需的大小,还是作为传给 fread 的长度;引自 cppreference:
If the stream is open in text mode, the value returned by this function is unspecified and is only meaningful as the input to fseek().
另请注意,正如评论中所指出的,如果文件大小超过 long int 变量所能存储的最大值,fseek() 和 ftell() 函数将会失败;要处理此类情况,您可以使用(依赖于平台的)64 位等效函数,如我在另一个回答中所述。
下面给您的函数加上注释(暂不讨论内存泄漏等问题),并统计对字符缓冲区的各项操作:
// Annotated copy of the question's read_file(): each "<<--" marker names how
// many characters that call has to walk over. OLD_SIZE is the text already in
// filedata, NEW_SIZE is the chunk just read by fgets().
//
// filedata: NUL-terminated string on entry, with room for everything appended.
// fp:       open stream to read from; closed before returning.
//
// NOTE(review): only the mangled '[=10=]' literal is repaired here (it must be
// '\0', otherwise strcat() starts from an unterminated buffer). new_str is
// still never freed and strcpy() still runs after a failed malloc(); those are
// left untouched because the annotations deliberately set them aside.
void read_file(char *filedata, FILE *fp){
    char buffer[1000];
    while(fgets(buffer, sizeof(buffer), fp))            // <<-- NEW_SIZE
    {
        char *new_str;
        if((new_str = malloc(strlen(filedata)           // <<-- OLD_SIZE
                             + strlen(buffer)           // <<-- NEW_SIZE
                             +1)) != NULL)
        {
            new_str[0] = '\0'; // ensures the memory is an empty string
            strcat(new_str, filedata);                  // <<-- OLD_SIZE
            strcat(new_str, buffer);                    // <<-- OLD_SIZE + NEW_SIZE
        }
        else
        {
            printf("malloc failed!\n");
        }
        strcpy(filedata, new_str);                      // <<-- OLD_SIZE + NEW_SIZE
    }
    fclose(fp);
}
fgets()、strlen()、strcat() 和 strcpy() 都需要遍历一个字符缓冲区。
其中只有 fgets() 是真正必需的,其余的复制操作都可以避免。
把遍历缓冲区的次数加起来:
每次循环的操作总和:4 * OLD_SIZE + 4 * NEW_SIZE。另外请记住,OLD_SIZE 实际上是此前所有 NEW_SIZE 的累加和 SUM(NEW_SIZE),因此您的函数的开销相对于循环迭代次数(基本上就是读取的行数)呈二次方(QUADRATIC)增长。
所以你最终得到:
Number of times a character is inspected
= 4 * N_LINE * LINE_SIZE
+ 2 * (N_LINE * (N_LINE-1) ) * LINE_SIZE
;
这意味着对于一个 100 行的文件,总开销大约相当于把一整行遍历 20K 次(4*100 + 2*100*99 = 20200 个 LINE_SIZE)。
[这是“画家施莱米尔”的故事]
下面是我的函数,说明了我通常是如何做的。不确定与所有其他可能的 C 实现相比它有多快。但我认为它们都非常相似,除非以某种方式编程不当,这可能会导致执行速度变慢、效率降低。
/* ==========================================================================
 * Function:    readfile ( FILE *ptr, int *nbytes )
 * Purpose:     read open file ptr into internal buffer
 * --------------------------------------------------------------------------
 * Arguments:   ptr (I)     FILE * to already open (via fopen)
 *                          file, whose contents are to be read
 *              nbytes (O)  int * returning #bytes in returned buffer
 *                          (may be NULL if the count is not wanted)
 * --------------------------------------------------------------------------
 * Returns:     ( unsigned char * ) buffer with ptr's contents,
 *              or NULL if ptr is NULL or memory could not be allocated
 * --------------------------------------------------------------------------
 * Notes:     o caller should free() returned output buffer ptr when finished
 *            o a non-NULL ptr is always fclose()'d before returning,
 *              including when allocation fails
 *            o when NULL is returned, *nbytes is set to 0
 *            o the buffer holds raw bytes; no terminating '\0' is appended
 *            o a read error ends the loop the same way as end-of-file
 *            o sizes are kept in int, so inputs approaching INT_MAX bytes
 *              are not supported
 * ======================================================================= */
/* --- entry point --- */
unsigned char *readfile ( FILE *ptr, int *nbytes ) {
  /* ---
   * allocations and declarations
   * ------------------------------- */
  unsigned char *outbuff = NULL;        /* realloc'ed below as data arrives */
  int   allocsz = 0,                    /* total #bytes allocated so far */
        reallocsz = 500000,             /* #bytes added per reallocation */
        blksz = 9900,                   /* #bytes requested per fread() */
        nread = 0,                      /* #bytes the last fread() returned */
        buffsz = 0;                     /* total #bytes stored in outbuff */
  /* ---
   * collect all bytes from ptr
   * ----------------------------- */
  if ( ptr != NULL ) {                  /* return NULL error if no input */
    while ( 1 ) {                       /* read all input from file */
      if ( buffsz+blksz + 99 >= allocsz ) { /* next block might not fit */
        unsigned char *grown = NULL;    /* keeps outbuff valid on failure */
        allocsz += reallocsz;           /* add reallocsz to allocation */
        grown = realloc(outbuff,allocsz);
        if ( grown == NULL ) {          /* out of memory */
          free(outbuff);                /* realloc left the old block alive */
          outbuff = NULL;               /* report failure to the caller */
          buffsz = 0;                   /* no bytes are being returned */
          break; }                      /* still fall through to fclose() */
        outbuff = grown; }              /* adopt the enlarged buffer */
      nread = (int)fread(outbuff+buffsz,1,blksz,ptr); /* read next block */
      if ( nread < 1 ) break;           /* all done, nothing left to read */
      buffsz += nread;                  /* add #bytes from current block */
      } /* --- end-of-while(1) --- */
    fclose(ptr);                        /* close fopen()'ed file ptr */
    } /* --- end-of-if(ptr!=NULL) --- */
  if ( nbytes != NULL ) *nbytes = buffsz; /* #bytes in outbuff */
  return ( outbuff );                   /* back to caller with output or NULL*/
  } /* --- end-of-function readfile() --- */