运行时指针变化
Pointer changes at runtime
我有以下文件 test.c
:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/*
 * Lexer helper macros.
 *
 * LEXER_INC(l)   advances the lexer's read position by one character and
 *                yields the position it had before the increment.
 * isCidstart(c)  non-zero when c may start a C-style identifier
 *                (a letter or '_').
 * isCident(c)    non-zero when c may continue a C-style identifier
 *                (a letter, a digit or '_').
 *
 * The <ctype.h> classifiers are only defined for EOF and for values
 * representable as unsigned char, so the argument is converted before the
 * call; a plain char may be negative and would otherwise be undefined
 * behaviour.  Both classification macros evaluate c twice, so do not pass
 * an expression with side effects.
 */
#define LEXER_INC(l) ( (l)->pos++ )
#define isCidstart(c) (isalpha((unsigned char)(c)) || (c) == '_')
#define isCident(c) (isalnum((unsigned char)(c)) || (c) == '_')
/* State of the lexer while it walks over one source buffer. */
typedef struct LexerState
{
const char *fileName; /* name of the source being scanned */
const char *sourceText; /* start of the source text */
int sourceLength; /* length of sourceText in characters */
const char *pos; /* current read position within sourceText */
const char *end; /* one past the last character of sourceText */
int line; /* current line number; lexerInit starts it at 1 */
} LexerState;
typedef enum LexerToken
{
TokenHalt,
TokenPush,
TokenPop,
TokenEndOfLine,
TokenEOF,
TokenNone
} LexerToken;
/* Associates the spelling of a reserved word with the token it produces. */
typedef struct ReservedWord
{
const char *word; /* spelling of the reserved word */
LexerToken token; /* token reported when the spelling matches */
} ReservedWord;
/*
 * Table of reserved words recognised by the lexer.  The table is only ever
 * read, so it is declared const to make accidental modification a
 * compile-time error.
 */
static const ReservedWord reservedWords[] =
{
    {"halt", TokenHalt},
    {"push", TokenPush},
    {"pop", TokenPop}
};
/*
 * Prepare a lexer to scan the built-in sample program.
 *
 * lexer: state object to fill in; every field is overwritten.
 *
 * After the call, pos refers to the first character of the text, end refers
 * to its terminating '\0', and line is 1.
 */
void lexerInit(LexerState *lexer)
{
    const char *text = "pop\r\npush\r\nhalt\r\n"; /* stands in for text read from a file */

    lexer->fileName = "test.s";
    lexer->line = 1;
    lexer->sourceText = text;
    lexer->sourceLength = strlen(text); /* 17 characters */
    lexer->pos = text;
    lexer->end = text + lexer->sourceLength;
}
/*
 * Look a word up in the reservedWords table.
 *
 * word: NUL-terminated string to look up; NULL is treated as "no match".
 *
 * Returns the token of the entry whose spelling equals word exactly
 * (the comparison is case-sensitive), or TokenNone when no entry matches.
 */
LexerToken lexerCheckReservedWord(const char *word)
{
    /* sizeof yields size_t, so the index is size_t as well; an int counter
       would mix signed and unsigned operands in the loop condition. */
    const size_t wordCount = sizeof(reservedWords) / sizeof(reservedWords[0]);

    if (word == NULL)
    {
        return TokenNone;
    }
    for (size_t i = 0; i < wordCount; i++)
    {
        if (strcmp(word, reservedWords[i].word) == 0)
        {
            return reservedWords[i].token;
        }
    }
    return TokenNone;
}
/*
 * Scan one identifier-like word starting at the current position and
 * classify it.
 *
 * lexer: lexer state; pos must refer to the first character of the word.
 *        On return pos refers to the first character after the word.
 *
 * Returns the matching reserved-word token, or TokenNone when the word is
 * not reserved or the temporary copy of the word could not be allocated.
 */
LexerToken lexerGetWord(LexerState *lexer)
{
    const char *startPos = lexer->pos;
    LexerToken token = TokenNone;
    char *word;
    size_t len;

    /* The first character is consumed unconditionally (lexerGetToken only
       calls this after isCidstart accepted it); then keep going while
       identifier characters follow.  The cast keeps negative char values
       away from the <ctype.h> classifier. */
    do
    {
        LEXER_INC(lexer);
    }
    while (lexer->pos != lexer->end && isCident((unsigned char)*lexer->pos));

    len = lexer->pos - startPos;
    word = malloc(len + 1); /* (len + 1) for the '\0' terminator */
    if (word == NULL)
    {
        return TokenNone;
    }
    strncpy(word, startPos, len);
    word[len] = '\0'; /* strncpy adds no terminator when it copies exactly len characters */
    token = lexerCheckReservedWord(word);
    free(word); /* the copy is only needed for the lookup */
    return token;
}
/*
 * Return the next token from the source text.
 *
 * lexer: lexer state; pos is advanced past whatever was consumed and line
 *        is incremented when a '\n' is consumed.
 *
 * Whitespace other than '\n' is skipped.  Returns TokenEndOfLine after
 * consuming a '\n', the result of lexerGetWord() when the next character can
 * start an identifier, and TokenEOF at the end of the text or when the next
 * character is not recognised.
 */
LexerToken lexerGetToken(LexerState *lexer)
{
    /* Skip white characters.  The cast keeps negative char values away from
       the <ctype.h> classifier. */
    while (lexer->pos != lexer->end && isspace((unsigned char)*lexer->pos))
    {
        if (*lexer->pos == '\n')
        {
            /* New line found */
            lexer->line++;
            LEXER_INC(lexer);
            return TokenEndOfLine;
        }
        LEXER_INC(lexer);
    }

    if (lexer->pos == lexer->end || *lexer->pos == '\0')
    {
        return TokenEOF;
    }

    if (isCidstart((unsigned char)*lexer->pos))
    {
        return lexerGetWord(lexer);
    }

    /* Any other character is not handled by this lexer and ends the scan. */
    return TokenEOF;
}
/*
 * Program entry point: lexes the built-in sample text and prints the numeric
 * value of each token until lexerGetToken() returns TokenEOF.
 */
int main(int argc, const char *argv[])
{
LexerState *lexer;
LexerToken token;
/*
 * NOTE(review): sizeof(lexer) is the size of the pointer, not of the
 * LexerState it points to, so lexerInit() below writes past the end of this
 * allocation (undefined behaviour).  This is the defect the surrounding
 * text discusses, so the line is left as posted; the correct form is
 * malloc(sizeof(*lexer)).  The result is also neither checked nor freed.
 */
lexer = malloc(sizeof(lexer));
lexerInit(lexer);
while ((token = lexerGetToken(lexer)) != TokenEOF)
{
printf("token %d\n", token);
}
return EXIT_SUCCESS;
}
当使用 gdb 调试函数 lexerGetWord 时,我发现在执行 word = malloc((len + 1) * sizeof(char)); 这一行之后,lexer->pos 指针的值变成了某个数字(在我的例子中是 0x23):
82 len = lexer->pos - startPos;
(gdb) next
83 word = malloc(len + 1); // (len + 1) for '\0' ending
(gdb) print len
$1 = 3
(gdb) print lexer->pos
$2 = 0x60003b1b3 "\r\npush\r\nhalt\r\n"
此时,从字符串 lexer->sourceText("pop\r\npush\r\nhalt\r\n")中取出单词 "pop" 之后,lexer->pos 指向这个地址。
(gdb) next
84 strncpy(word, startPos, len);
(gdb) print lexer->pos
$3 = 0x23 <error: Cannot access memory at address 0x23>
但是在 malloc 执行之后,指针的值发生了变化,导致随后出现 Segmentation fault。
$ gcc test.c -o test
$ ./test
token 2
Segmentation fault (core dumped)
我做错了什么?
编辑
这只发生在 Cygwin gcc 中。我尝试了 MinGW,一切正常。
$ uname -a
CYGWIN_NT-6.3 Stepan 2.9.0(0.318/5/3) 2017-09-12 10:18 x86_64 Cygwin
问题在main
:
lexer = malloc(sizeof(lexer));
您只为指针本身分配了足够的空间,而不是为它所指向的结构体分配空间。结果,您的写入越过了已分配内存的末尾,这会引发未定义行为(undefined behavior)。
您需要按 lexer 所指向的对象的大小来分配空间:
lexer = malloc(sizeof(*lexer));
我有以下文件 test.c
:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/*
 * Lexer helper macros.
 *
 * LEXER_INC(l)   advances the lexer's read position by one character and
 *                yields the position it had before the increment.
 * isCidstart(c)  non-zero when c may start a C-style identifier
 *                (a letter or '_').
 * isCident(c)    non-zero when c may continue a C-style identifier
 *                (a letter, a digit or '_').
 *
 * The <ctype.h> classifiers are only defined for EOF and for values
 * representable as unsigned char, so the argument is converted before the
 * call; a plain char may be negative and would otherwise be undefined
 * behaviour.  Both classification macros evaluate c twice, so do not pass
 * an expression with side effects.
 */
#define LEXER_INC(l) ( (l)->pos++ )
#define isCidstart(c) (isalpha((unsigned char)(c)) || (c) == '_')
#define isCident(c) (isalnum((unsigned char)(c)) || (c) == '_')
/* State of the lexer while it walks over one source buffer. */
typedef struct LexerState
{
const char *fileName; /* name of the source being scanned */
const char *sourceText; /* start of the source text */
int sourceLength; /* length of sourceText in characters */
const char *pos; /* current read position within sourceText */
const char *end; /* one past the last character of sourceText */
int line; /* current line number; lexerInit starts it at 1 */
} LexerState;
typedef enum LexerToken
{
TokenHalt,
TokenPush,
TokenPop,
TokenEndOfLine,
TokenEOF,
TokenNone
} LexerToken;
/* Associates the spelling of a reserved word with the token it produces. */
typedef struct ReservedWord
{
const char *word; /* spelling of the reserved word */
LexerToken token; /* token reported when the spelling matches */
} ReservedWord;
/*
 * Table of reserved words recognised by the lexer.  The table is only ever
 * read, so it is declared const to make accidental modification a
 * compile-time error.
 */
static const ReservedWord reservedWords[] =
{
    {"halt", TokenHalt},
    {"push", TokenPush},
    {"pop", TokenPop}
};
/*
 * Prepare a lexer to scan the built-in sample program.
 *
 * lexer: state object to fill in; every field is overwritten.
 *
 * After the call, pos refers to the first character of the text, end refers
 * to its terminating '\0', and line is 1.
 */
void lexerInit(LexerState *lexer)
{
    const char *text = "pop\r\npush\r\nhalt\r\n"; /* stands in for text read from a file */

    lexer->fileName = "test.s";
    lexer->line = 1;
    lexer->sourceText = text;
    lexer->sourceLength = strlen(text); /* 17 characters */
    lexer->pos = text;
    lexer->end = text + lexer->sourceLength;
}
/*
 * Look a word up in the reservedWords table.
 *
 * word: NUL-terminated string to look up; NULL is treated as "no match".
 *
 * Returns the token of the entry whose spelling equals word exactly
 * (the comparison is case-sensitive), or TokenNone when no entry matches.
 */
LexerToken lexerCheckReservedWord(const char *word)
{
    /* sizeof yields size_t, so the index is size_t as well; an int counter
       would mix signed and unsigned operands in the loop condition. */
    const size_t wordCount = sizeof(reservedWords) / sizeof(reservedWords[0]);

    if (word == NULL)
    {
        return TokenNone;
    }
    for (size_t i = 0; i < wordCount; i++)
    {
        if (strcmp(word, reservedWords[i].word) == 0)
        {
            return reservedWords[i].token;
        }
    }
    return TokenNone;
}
/*
 * Scan one identifier-like word starting at the current position and
 * classify it.
 *
 * lexer: lexer state; pos must refer to the first character of the word.
 *        On return pos refers to the first character after the word.
 *
 * Returns the matching reserved-word token, or TokenNone when the word is
 * not reserved or the temporary copy of the word could not be allocated.
 */
LexerToken lexerGetWord(LexerState *lexer)
{
    const char *startPos = lexer->pos;
    LexerToken token = TokenNone;
    char *word;
    size_t len;

    /* The first character is consumed unconditionally (lexerGetToken only
       calls this after isCidstart accepted it); then keep going while
       identifier characters follow.  The cast keeps negative char values
       away from the <ctype.h> classifier. */
    do
    {
        LEXER_INC(lexer);
    }
    while (lexer->pos != lexer->end && isCident((unsigned char)*lexer->pos));

    len = lexer->pos - startPos;
    word = malloc(len + 1); /* (len + 1) for the '\0' terminator */
    if (word == NULL)
    {
        return TokenNone;
    }
    strncpy(word, startPos, len);
    word[len] = '\0'; /* strncpy adds no terminator when it copies exactly len characters */
    token = lexerCheckReservedWord(word);
    free(word); /* the copy is only needed for the lookup */
    return token;
}
/*
 * Return the next token from the source text.
 *
 * lexer: lexer state; pos is advanced past whatever was consumed and line
 *        is incremented when a '\n' is consumed.
 *
 * Whitespace other than '\n' is skipped.  Returns TokenEndOfLine after
 * consuming a '\n', the result of lexerGetWord() when the next character can
 * start an identifier, and TokenEOF at the end of the text or when the next
 * character is not recognised.
 */
LexerToken lexerGetToken(LexerState *lexer)
{
    /* Skip white characters.  The cast keeps negative char values away from
       the <ctype.h> classifier. */
    while (lexer->pos != lexer->end && isspace((unsigned char)*lexer->pos))
    {
        if (*lexer->pos == '\n')
        {
            /* New line found */
            lexer->line++;
            LEXER_INC(lexer);
            return TokenEndOfLine;
        }
        LEXER_INC(lexer);
    }

    if (lexer->pos == lexer->end || *lexer->pos == '\0')
    {
        return TokenEOF;
    }

    if (isCidstart((unsigned char)*lexer->pos))
    {
        return lexerGetWord(lexer);
    }

    /* Any other character is not handled by this lexer and ends the scan. */
    return TokenEOF;
}
/*
 * Program entry point: lexes the built-in sample text and prints the numeric
 * value of each token until lexerGetToken() returns TokenEOF.
 */
int main(int argc, const char *argv[])
{
LexerState *lexer;
LexerToken token;
/*
 * NOTE(review): sizeof(lexer) is the size of the pointer, not of the
 * LexerState it points to, so lexerInit() below writes past the end of this
 * allocation (undefined behaviour).  This is the defect the surrounding
 * text discusses, so the line is left as posted; the correct form is
 * malloc(sizeof(*lexer)).  The result is also neither checked nor freed.
 */
lexer = malloc(sizeof(lexer));
lexerInit(lexer);
while ((token = lexerGetToken(lexer)) != TokenEOF)
{
printf("token %d\n", token);
}
return EXIT_SUCCESS;
}
当使用 gdb 调试函数 lexerGetWord 时,我发现在执行 word = malloc((len + 1) * sizeof(char)); 这一行之后,lexer->pos 指针的值变成了某个数字(在我的例子中是 0x23):
82 len = lexer->pos - startPos;
(gdb) next
83 word = malloc(len + 1); // (len + 1) for '\0' ending
(gdb) print len
$1 = 3
(gdb) print lexer->pos
$2 = 0x60003b1b3 "\r\npush\r\nhalt\r\n"
此时,从字符串 lexer->sourceText("pop\r\npush\r\nhalt\r\n")中取出单词 "pop" 之后,lexer->pos 指向这个地址。
(gdb) next
84 strncpy(word, startPos, len);
(gdb) print lexer->pos
$3 = 0x23 <error: Cannot access memory at address 0x23>
但是在 malloc 执行之后,指针的值发生了变化,导致随后出现 Segmentation fault。
$ gcc test.c -o test
$ ./test
token 2
Segmentation fault (core dumped)
我做错了什么?
编辑
这只发生在 Cygwin gcc 中。我尝试了 MinGW,一切正常。
$ uname -a
CYGWIN_NT-6.3 Stepan 2.9.0(0.318/5/3) 2017-09-12 10:18 x86_64 Cygwin
问题在main
:
lexer = malloc(sizeof(lexer));
您只为指针本身分配了足够的空间,而不是为它所指向的结构体分配空间。结果,您的写入越过了已分配内存的末尾,这会引发未定义行为(undefined behavior)。
您需要按 lexer 所指向的对象的大小来分配空间:
lexer = malloc(sizeof(*lexer));