如果符号之间没有space，C计数字程序就可以工作，为什么？

Question

这是我正在使用的代码，它可以完美运行，除非：

文本文件以整数开头。
如果符号之间没有 space，例如 [hi!] 可以完美运行，但 [hi !] 打印 �
如果我将多个符号放在一起，例如 [??????????]，字数将不正确。

澄清一下，如果代码中有整数是可以的，但如果它们在开头则不行。

为什么会这样？我真的很好奇如何解决这个问题。

编辑：有人向我指出 fscanf 不支持正则表达式，为什么它会正确过滤掉字母？

#include <stdio.h>
#include <stdlib.h>

/* counts words in the file */
int main(void) 
{
    FILE *fp;
    int r,n,i; /* a variable for result of a function, returning int */
     /* the words counter */
    const char *filename = "test2.txt"; /* a file name opening for read */
    char word[10]; /* an array for the check if a non-empty word was read */

    if ((fp = fopen(filename, "r")) == NULL) {
        fprintf(stderr, "error: file" "\n");
        return 1;
    }  
    /* if can't open the file for read
       then print an error message and return false to the environment */
    int arraylen = sizeof(word)/sizeof(word[0]); /*write the length of array word to arraylen*/ 
    n = 0; /* turn the counter of words to zero */
    word[0] = '[=11=]'; /* turn the word array to an empty state */
   while ((r = fscanf(fp, "\n%10[^A-Za-z]%*c", word)) == 1) {
        printf("firstoutput\n");
        for(i=0;i<arraylen;i++)

                printf("%c",word[i]);


    if((r = fscanf(fp, "\n%[A-Za-z]%*c", word)) == 0) { /*in case next character is not a 
        letter do nothing, this is in place to prevent the program from getting stuck*/

         printf("secondoutput\n");
        for(i=0;i<arraylen;i++)
            if(word[i] != (' '))
                printf("%c",word[i]);
        }

        if (word[0] != '[=11=]')
            n++;
        /* if the word array got something,
           then it was a word, count it */ 

        word[0] = '[=11=]'; /* turn the word back into an empty state */
    }  
    /* skipping words delimeted by ' ' or '\n' or ','
       while file fp can be read, continue skipping
       and count every skip */


    if (ferror(fp) != 0) { /* check the file for read error if EOF occured */
        fprintf(stderr, "error: read file" "\n");
        fclose(fp);
        return 1;
    }
    /* if there was an error while reading the file
       then print error, close the file (because it was opened though)
       and return false to the environment */

    if (n == 1) /* control "to be" and endings for word or words */
        printf("\nthere is %d word" "\n", n);
    else
        printf("\nthere are %d words" "\n", n);

    fclose(fp); /* close the file */

    return 0; /* return success to the environment */
}

EDIT2：我发布了一个 101% 有效的完整解决方案，我结合了我发现并创建的所有各种技巧和东西，我相信这个解决方案非常紧凑和高效，如果我错了请纠正我!

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define WORD "A-Za-z"


int countWords(FILE *f,int* now){
   char ch[100][100];
   int x;
   *now=0;
   int count = 0;
       while (fscanf(f, "%[^" WORD "]",ch[count]) != EOF
        && fscanf(f, "%10[" WORD "]",ch[count]) != EOF) {

        if(count>0) {

        for(x=0;x<count;x++) {

          if((strcasecmp(ch[count],ch[x]))==0) { /* Compare if the two words match, 
          case insensitive*/
            if(fscanf(f, "%[^" WORD "]",ch[count]) != EOF
            && fscanf(f, "%10[" WORD "]",ch[count]) != EOF) /* Since the words match,
            now we have to overwrite the double word by scanning the next set of characters.*/
              printf("String is equal\n");

              (*now)++;
          }
        }
      }
        puts(ch[count]);
        count++;
        (*now)++;
    }
    puts(ch[13]);
    printf("%d\n",*now );

   return count;
}

int main(void){

   int uniquewordCount=0,wordCount=0;
   FILE *rFile = fopen("test2.txt", "r");
   uniquewordCount += countWords(rFile,&wordCount);
   printf("%d\n",(a+b));
   printf("Amount of unique words: %d\n", uniquewordCount);
   printf("Amount of words: %d\n", wordCount);
   return 0;
}

Answer 1

您的第一个扫描命令是

fscanf(fp, "\n%10[^A-Za-z]%*c", word)

这将跳过任何白色-space ("\n")，然后读入最多 10 个不是字母的字符 ("%10[^A-Za-z]") 的缓冲区，最后读取字符在那之后（"*c"）。

当文件以字母开头时不要进入 while 循环，因为 fscanf 将 return 0，因为它无法扫描非空字符串非字母。如果之前连white space都跳过了，while循环只会在第一个非space字符不是字母的时候才进入，这种可能性不大。

因为您在 fscanf 末尾读了一个额外的字符，所以您的单词会漏读第一个字母。

通过将单词限制为 10 个字母，您可以分块阅读较长的单词，这应该会通过将较长的单词计算为两三个单词来减少您的单词计数。字符缓冲区 word 也应该至少有 11 个字符。

打印字符串的正确方法是printf("%s", word)或puts(word)。你的循环基本上没问题，但打印所有十个字母，即使这个词可能有更少的字母。

我不认为使用 fscanf 是计算单词的好方法，但如果你想使用它，你可以这样做：

#include <stdio.h>
#include <stdlib.h>

#define WORD "A-Za-z0-9'"

int main(void)
{
    FILE *fp = stdin;
    int n = 0;

    while (fscanf(fp, "%*[^" WORD "]") != EOF
        && fscanf(fp, "%*[" WORD "]") != EOF) {
        n++;
    }

    printf("%d words\n", n);

    return 0;
}

在这里，我们 fscanf 交替处理非单词和单词，并在其中一个 fscanf 发出文件结束信号时结束 lopp。请注意，我们不关心实际内容，因为我们通过不使用 * 星号转换任何内容来跳过它们。这意味着 fscanf 的唯一结果可以是 0 和 EOF.

计算单词的常用方法是读取字符并检测 "context" 何时从非单词转换为单词：

#include <stdio.h>
#include <stdlib.h>

int isword(int c)
{
    if ('A' <= c && c <= 'Z') return 1;
    if ('a' <= c && c <= 'z') return 1;
    if ('0' <= c && c <= '9') return 1;
    if (c == '-') return 1;
    if (c == '\'') return 1;
    return 0;
}

int main(void)
{
    FILE *fp = stdin;       // or use a real file, of course
    int word = 0;           // 0: space cntext, 1: word context
    int n = 0;

    for (;;) {
        int c = fgetc(fp);

        if (c == EOF) break;

        if (isword(c)) {
            if (word == 0) n++;
            word = 1;
        } else {
            word = 0;
        }
    }

    printf("%d words\n", n);

    return 0;
}

函数isword在这里定义了什么算作一个词。请注意如何不需要保持两个 fscanf 格式同步。一个字母是一个词的一部分，或者不是。, 这与 else 子句是明确的。

如果符号之间没有space，C计数字程序就可以工作，为什么？

C counting word program works if there is no space between symbols, why?

c

regex

words

scanf

count