在 C++ 中不使用 vector 解析 csv 文件

Parsing csv file without using vector in C++

我正在 Visual Studio 上处理 C++ 项目。我有一个 csv 文件,看起来像:

"0","12312415"," my whole body is tired"
"0","12365448","I just want to rest my ears because of I do not see"
"0","123156984","I want to go to cinema with my girls friend. I am so tired"

所以,我想在不使用 vector 的情况下解析这些数据并将其放入数组中。然后我要在每一行的最后一项(句子)中找出常用词。我的预期输出如下:

<I> <four times count>
<my> <three times count>
<to> <three times count>

有什么办法吗?我使用此代码进行排序,但我不知道如何转换我的代码,以读取数据并将数据放入数组。

// Sifts the value at index i downwards inside arr[0..n) so that it ends up
// no smaller than its children (max-heap order along the path it travels).
//   arr - array that stores the binary heap
//   n   - number of leading elements of arr that belong to the heap
//   i   - index where the sift-down starts
void heapify(int arr[], int n, int i)
{
    int root = i;

    for (;;)
    {
        const int left = 2 * root + 1;  // index of the left child
        const int right = left + 1;     // index of the right child
        int biggest = root;

        // Pick the largest of the root and whichever children exist.
        if (left < n && arr[left] > arr[biggest])
            biggest = left;
        if (right < n && arr[right] > arr[biggest])
            biggest = right;

        // The root already dominates its children: nothing left to fix.
        if (biggest == root)
            return;

        // Push the root one level down and continue from its new position.
        std::swap(arr[root], arr[biggest]);
        root = biggest;
    }
}

// Sorts arr[0..n) in ascending order with heap sort.
//   arr - array to sort in place
//   n   - number of elements in arr
void heapSort(int arr[], int n)
{
    // Phase 1: turn the array into a max-heap by sifting down every
    // non-leaf node, starting from the last one.
    int node = n / 2 - 1;
    while (node >= 0)
    {
        heapify(arr, n, node);
        --node;
    }

    // Phase 2: repeatedly move the current maximum (the root) behind the
    // shrinking heap and repair the heap that remains in front of it.
    int last = n - 1;
    while (last >= 0)
    {
        std::swap(arr[0], arr[last]);
        heapify(arr, last, 0);
        --last;
    }
}

// Writes the first n elements of arr to standard output, each followed by
// a single space, and ends the line with a newline.
void printArray(int arr[], int n)
{
    int idx = 0;
    while (idx < n)
    {
        std::cout << arr[idx] << " ";
        ++idx;
    }
    std::cout << "\n";
}
// Demo driver: heap-sorts a fixed sample array, prints the result and
// reports the processor time the run consumed.
int main()
{
    const clock_t startTicks = clock();

    int values[] = { 12,11,13,5,6,7,62,25,27 };
    // Element count derived from the array's total size.
    const int length = sizeof(values) / sizeof(values[0]);

    heapSort(values, length);

    std::cout << "Sorted array is \n";
    printArray(values, length);

    const clock_t stopTicks = clock();
    // Convert clock ticks to seconds.
    const float elapsed = float(stopTicks - startTicks) / CLOCKS_PER_SEC;
    std::cout << "Time elapsed Time: " << elapsed << " seconds" << std::endl;

    // Runs the shell command "PAUSE" before the program returns.
    system("PAUSE");
    return 0;
}

因为您不想为此使用 std::vector(这本是推荐的做法),您应该使用二维数组来读取 csv 文件。数组的第一维是行数,第二维是每行的字段数。在您的例子中,两个维度都等于 3。可参考 “Read csv file using 2d array”。


准备好二维数组后,您需要统计每个单词的出现频率。为此,您可以使用一个由 pair 组成的一维数组,其中 pair 的第一个字段是单词,第二个字段是频率。具体做法是:遍历二维数组,取出每行的最后一个字段(句子),然后按空格把该字符串(句子)拆分成单词。接着,对于每个单词,检查它是否已经出现在 pair 数组中:如果是,则将其频率加一(因为您之前已经见过该单词,现在又见到了一次);如果不是,则把它插入数组,并将其频率设置为 1,因为这是您第一次见到这个单词。

这个 pair 数组的大小应该是多少?std::vector 会在您向其中插入元素时自动处理动态增长,但既然您不想使用它,就需要自己考虑数组的大小。

由于 csv 文件中的字数未知,您需要考虑文件的最大字数。确保它是一个大尺寸,以便能够存储您将看到的所有单词。另一方面,不要设置太大,因为你会分配太多内存,那会浪费。

设置大小后,您将使用一个计数器来保存实际读取的字数。这样,您就会知道数组的有意义的大小,当您想遍历该数组时会使用它,例如,打印它。


然后,您将对数组进行排序。 std::sort 非常适合这个,您应该在其中定义一个函数来比较您要排序的数组的元素。

最后,您将只打印频率大于 1 的单词;这些是常用词。


将所有内容放在一起,我们得到:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <algorithm>
using namespace std;

// Looks for 'word' among the first 'count' entries of 'wordFr'.
//   wordFr - array of (word, frequency) pairs
//   count  - number of leading entries to inspect; nothing is inspected when it is <= 0
//   word   - word to look for (taken by const reference so no copy is made per lookup)
// Returns the index of the first entry whose word equals 'word', or -1 if there is none.
int search_pair_in_array(const std::pair<std::string, int> wordFr[], const int count, const std::string& word)
{
  for(int i = 0; i < count; ++i)
    if(wordFr[i].first == word)
      return i;
  return -1;
}

// Ordering predicate for std::sort: true when the first pair's frequency
// (its .second) is strictly greater than the second pair's, which sorts the
// pairs by descending frequency.
bool pairCmp(const std::pair<std::string, int>& wordFr1, const std::pair<std::string, int>& wordFr2)
{
  const int firstCount = wordFr1.second;
  const int secondCount = wordFr2.second;
  return secondCount < firstCount;
}

// Splits one CSV record into at most 'maxFields' fields, stores them in 'fields'
// and returns how many were stored.
// A field that begins with a double quote runs up to its closing quote, so commas
// inside it are kept and the surrounding quotes are dropped; a doubled quote ("")
// inside such a field yields one literal quote. Any other field is copied verbatim
// up to the next comma. Quoted fields that span several lines are not supported.
static int split_csv_line(const string& line, string fields[], const int maxFields)
{
  int stored = 0;
  size_t pos = 0;
  // 'pos == line.size()' is still processed so that a trailing comma yields an empty last field
  while(stored < maxFields && pos <= line.size())
  {
    string field;
    if(pos < line.size() && line[pos] == '"')
    {
      ++pos; // skip the opening quote
      while(pos < line.size())
      {
        if(line[pos] == '"')
        {
          if(pos + 1 < line.size() && line[pos + 1] == '"')
          {
            // escaped quote inside a quoted field
            field += '"';
            pos += 2;
            continue;
          }
          ++pos; // closing quote
          break;
        }
        field += line[pos++];
      }
      // ignore anything between the closing quote and the next delimiter
      const size_t comma = line.find(',', pos);
      pos = (comma == string::npos) ? line.size() + 1 : comma + 1;
    }
    else
    {
      const size_t comma = line.find(',', pos);
      const size_t end = (comma == string::npos) ? line.size() : comma;
      field = line.substr(pos, end - pos);
      pos = end + 1;
    }
    fields[stored++] = field;
  }
  return stored;
}

// Reads "myFile.csv", counts how often every word occurs in the last field of each
// line and prints the words that occur more than once, most frequent first.
// Returns 0 on success and 1 when the file cannot be opened.
int main()
{
  // capacity of the 2D array: max number of lines kept and number of fields per line
  const int N = 3, M = 3;
  // line read from file, 2D array holding the fields of every stored line
  string line, lines[N][M];
  ifstream csvFile("myFile.csv");

  if(!csvFile.is_open())
  {
    // report the failure instead of silently printing nothing
    cerr << "Could not open file" << endl;
    return 1;
  }
  cout << "Successfully opened file"<<endl;

  // number of lines actually stored in 'lines'
  int rows = 0;
  // read whole lines; stop once the array is full so nothing is written past lines[N - 1]
  while(rows < N && getline(csvFile, line))
  {
    // drop the '\r' left behind by files with Windows line endings
    if(!line.empty() && line[line.size() - 1] == '\r')
      line.erase(line.size() - 1);
    // skip blank lines (e.g. an empty line at the end of the file)
    if(line.empty())
      continue;
    // fields missing from a short line simply stay empty strings
    split_csv_line(line, lines[rows], M);
    ++rows;
  }

  // max number of distinct words
  const int W = 100;
  // array of pairs that stores a word and its frequency per cell
  pair<string, int> wordFr[W];
  // number of distinct words stored so far
  int count = 0;
  // set when a new word had to be dropped because 'wordFr' was already full
  bool tableFull = false;
  // for every stored line of the 2D array
  for(int i = 0; i < rows; ++i)
  {
    string word;
    // get the last field (the sentence) of the i-th line
    stringstream ss(lines[i][M - 1]);
    // split sentence to words (implicit delimiter: whitespace)
    while(ss >> word)
    {
      // look only at the 'count' cells that are in use
      const int idx = search_pair_in_array(wordFr, count, word);
      if(idx != -1)
        wordFr[idx].second++; // seen before: increase its frequency by one
      else if(count < W)
        wordFr[count++] = make_pair(word, 1); // first occurrence: frequency 1
      else
        tableFull = true; // no room left; never write past wordFr[W - 1]
    }
  }

  if(tableFull)
    cerr << "Warning: more than " << W << " distinct words; the extra ones were not counted" << endl;

  // sort the first 'count' elements of 'wordFr' by descending frequency
  sort(wordFr, wordFr + count, pairCmp);

  cout << "Word, Frequency\n";
  for(int i = 0; i < count; ++i)
    if(wordFr[i].second > 1) // print only common words (frequency > 1)
      cout << wordFr[i].first << ", " << wordFr[i].second << endl;

  return 0;
}

输出:

Successfully opened file
Word, Frequency
I, 4
my, 3
to, 3
want, 2
tired, 2