如何从缓冲区中逐字读取仅由“：”分隔的单词？

Question

我正在制作一个语言翻译器，想从缓冲区中逐字读取并将它们存储在键值结构中。

缓冲区包含这样一个文件：

hola:hello
que:what

等等。我已经尝试了各种方法，但总是遇到诸如 segmentation fault: 11 之类的错误，或者只是一次又一次地读取同一行。

// One translation entry. Both members are plain pointers: the struct has no
// character storage of its own, so whatever they point at must stay valid for
// as long as the entry is used.
struct key_value{
char *key;   // points to the source word
char *value; // points to the translated word
};

...

// Fragment from the question; i, count, numbytes and buffer are declared in
// code that is not shown here.

// Room for `count` entries; the malloc result is not checked.
struct key_value *kv = malloc(sizeof(struct key_value) * count);
char k[20]; // key scratch buffer: 20 bytes, but the format below stores up to 21 characters plus a terminator
char v[20]; // value scratch buffer: same size mismatch as k
int x = 0; // not used anywhere in the visible code
// Runs numbytes times, while kv was sized for `count` entries.
// NOTE(review): if numbytes > count this writes past the end of kv - confirm against the caller.
for(i = 0; i < numbytes; i++){
    // `buffer` is never advanced, so every call parses the same first line;
    // the return value of sscanf is ignored.
    sscanf(buffer,"%21[^:]:%21[^\n]\n",k,v);
    // Every entry receives the address of the same two local arrays,
    // so all entries alias whatever k and v hold last.
    (kv + i)->key = k;
    (kv + i)->value = v;
}

// Prints `count` entries; each one dereferences the shared k and v buffers.
for(i = 0; i < count; i++){
    printf("key:  %s, value:  %s\n",(kv + i)->key,(kv + i)->value);
}

free(buffer);
free(kv);

我希望输出为 key: hola, value: hello key: que, value: what，但实际输出只是 key: hola, value: hello 一次又一次。

哪种方法正确？

Answer 1

您的代码存在多个问题，其中包括：

在每次循环迭代中，您都是从缓冲区的开头读取。那么很自然，每次迭代都会提取相同的键和值。
更一般地说，你的读取循环迭代变量似乎与读取的数据没有关系。它似乎是一个 per-byte 迭代，但你似乎想要一个 per-line 迭代。您可能需要查看 scanf 的 %n 指令以帮助您跟踪缓冲区的进度。
您将每个键/值对扫描到相同的局部变量 k 和 v 中，然后将指向这些变量的指针赋给您的结构体。结果所有指针都相同，并且它们会在函数返回时失效。我建议将 struct key_value 的成员声明为数组而不是指针，并将数据复制到其中。
您的 sscanf 格式为键和值分别读取最多 21 个字符，但提供的目标数组不够长。您需要将它们的大小声明为至少 22 个字符，以容纳 21 个字符和一个字符串终止符。
您的 sscanf() 格式和用法无法识别格式错误的输入，尤其是超长的键或值。您需要检查返回值，并且可能需要用 %c 字段来匹配尾随的换行符（格式中的字面换行符并不代表您认为的意思：它匹配的是任意数量的空白字符）。

使用 strtok_r 或 strtok 甚至 strchr 而不是 sscanf() 对（整个缓冲区）进行标记对您来说可能更容易。

此外，关于代码风格：您的 (kv + i)->key 形式的表达式是有效的，但写成 kv[i].key 会更符合惯用写法。

Answer 2

我编写了一段简单的代码，可以帮助您解决问题。我使用函数 fgets 从名为 "file.txt" 的文件中读取，并使用函数 strchr 来查找第一次出现的分隔符 ':'。

这里是代码：

#include <stdio.h>
#include <string.h>
#include <errno.h>

// Size in bytes of the decod.key field.
#define MAX_LINE_SIZE       256
// Size in bytes of the decod.value field.
#define MAX_DECODED_LINE    1024

// One parsed line: the text before the first ':' and the text after it.
struct decod {
    char key[MAX_LINE_SIZE];
    char value[MAX_DECODED_LINE];
};

// File-scope table of parsed entries; holds 1024 elements and is
// zero-initialized because it has static storage duration.
static struct decod decod[1024];

// Reads "key:value" lines from file.txt into decod[] and prints every pair.
//
// Each line is cut at its first CR or LF, then split at the first ':'.
// A line without ':' is stored with an empty value.
// A line longer than the read buffer arrives in several fgets() chunks, and
// each chunk is stored as an entry of its own.
//
// Returns 0 on success, or the errno value set by fopen() when the file
// cannot be opened.
int main(void)
{
    // Capacity of decod[], taken from the array itself so the bound cannot
    // drift away from the declaration.
    const int max_entries = (int)(sizeof decod / sizeof decod[0]);

    FILE * fptr = NULL;

    char fbuf[MAX_LINE_SIZE];
    char * value;
    int cnt=0,i;

    if ( !(fptr=fopen("file.txt","r")) )
    {
        // Save errno first: perror() performs I/O and may overwrite it.
        int err = errno;
        perror("");
        return err;
    }

    while( fgets(fbuf,sizeof fbuf,fptr)) {
        // Eliminate UNIX/DOS line terminator
        fbuf[strcspn(fbuf,"\r\n")]=0;

        //Find first occurrence of the separator ':'
        value=strchr(fbuf,':');
        if (value) {
            // Truncates fbuf string to first word
            // and (++) points second word
            *value++=0;
        }

        if (cnt>=max_entries) {
            fprintf(stderr,
                 "Cannot read more than %d lines\n", max_entries);
            break;
        }

        // Bounded copies: stay inside the destination fields even if the
        // field sizes are changed later.
        snprintf(decod[cnt].key,sizeof decod[cnt].key,"%s",fbuf);
        snprintf(decod[cnt].value,sizeof decod[cnt].value,"%s",
                 value ? value : "");
        cnt++;
    }

    // fptr is known to be non-NULL here: the open failure returned above.
    fclose(fptr);

    for(i=0;i<cnt;i++) {
        printf("key:%s\tvalue:%s\n",decod[i].key,decod[i].value);
    }

    return 0;
}

此代码读取名为 file.txt 的文件中的所有行（最多 1024 行），将解析出的各个键值对（最多 1024 对）加载到结构体数组 decod 中，然后打印该数组的内容。

Answer 3

我写了这段代码，我认为它可以完成工作！我认为这比被采纳的答案更简单！它只使用所需的内存，不多也不少。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>


struct key_value{
    char key[22];    /* up to 21 characters plus the terminating NUL */
    char value[22];  /* up to 21 characters plus the terminating NUL */
};

/*
 * Parses one line of the form "key:value". The line is the `len` bytes
 * starting at `line` and does not include the newline. The key ends at the
 * first ':'; any later ':' belongs to the value.
 *
 * Returns 1 if the line is well-formed, 0 otherwise. A line is rejected when
 * it has no ':', when the key or the value is empty, or when either does not
 * fit its field. When `out` is not NULL and the line is accepted, *out is
 * filled in; a rejected line never touches *out.
 */
static int parse_pair(const char* line, size_t len, struct key_value* out){
    const char* sep = memchr(line, ':', len);
    if(sep == NULL){
        return 0;
    }

    size_t key_len   = (size_t)(sep - line);
    size_t value_len = len - key_len - 1;

    /* sizeof does not evaluate its operand, so this is fine with out == NULL */
    if(key_len == 0 || key_len >= sizeof out->key){
        return 0;
    }
    if(value_len == 0 || value_len >= sizeof out->value){
        return 0;
    }

    if(out != NULL){
        memcpy(out->key, line, key_len);
        out->key[key_len] = '\0';
        memcpy(out->value, sep + 1, value_len);
        out->value[value_len] = '\0';
    }
    return 1;
}

/*
 * Walks `str` one line at a time and returns the number of well-formed pairs.
 * With out == NULL the pairs are only counted. Otherwise they are stored in
 * out[0..], and the walk stops once `cap` entries have been written.
 */
static int scan_pairs(const char* str, struct key_value* out, int cap){
    int found = 0;
    const char* line = str;

    while(*line != '\0'){
        size_t len = strcspn(line, "\n");
        struct key_value* slot = NULL;

        if(out != NULL){
            if(found == cap){
                break;
            }
            slot = &out[found];
        }
        found += parse_pair(line, len, slot);

        /* the cursor always moves forward, so the loop terminates */
        line += len;
        if(*line == '\n'){
            ++line;
        }
    }
    return found;
}

/*
 * Splits `str` into "key:value" lines and returns them as a newly allocated
 * array.
 *
 *   str    - NUL-terminated text, one pair per line; it is not modified
 *   kv_arr - receives the array; the caller releases it with free()
 *   num    - receives the number of entries in *kv_arr
 *
 * Malformed lines (see parse_pair) are skipped. When no pair is found, or
 * when the allocation fails, *kv_arr is NULL and *num is 0; free(NULL) is
 * still safe for the caller. The pair count is printed to stdout as
 * "n = <count>".
 *
 * The text is scanned twice, first to count and then to fill, so the array
 * is allocated once at exactly the size needed.
 */
void parse_str(char* str, struct key_value** kv_arr, int* num){

    *kv_arr = NULL;
    *num    = 0;
    if(str == NULL){
        return;
    }

    /*counting number of key-value pairs*/
    int n = scan_pairs(str, NULL, 0);
    printf("n = %d\n", n);
    if(n == 0){
        return;
    }

    struct key_value* arr = malloc(sizeof *arr * (size_t)n);
    if(arr == NULL){
        return;
    }

    /*filling key_value array*/
    scan_pairs(str, arr, n);

    *kv_arr = arr;
    *num    = n;
}


int main(){

    char* str = "hola:hello\n"
                "que:what\n";

    int n;
    struct key_value* kv_arr;

    parse_str(str, &kv_arr, &n);

    for (int i = 0; i < n; ++i) {
        printf("%s  <--->  %s\n", kv_arr[i].key, kv_arr[i].value);
    }


    free(kv_arr);
    return 0;
}

输出：

n = 2
hola  <--->  hello
que  <--->  what

进程已完成，退出代码为 0

注意：sscanf 处理的是 const char*，而不是来自文件的输入流，因此它不会保存任何关于已消耗内容的信息。

解决方案 ：我在格式字符串中使用 %n 来获取到目前为止它已经消耗的字符数（C89 标准）。

如何从缓冲区中逐字读取仅由“：”分隔的单词？

How to read word for word that are only separated by a ":" from the buffer?

c

buffer