为什么 rcpp 函数很慢?
Why is the rcpp function slow?
我写了一个简单的函数:如果某一行 Reversal_Accounting_Transaction_ID 中的值出现在 Accounting_Transaction_ID 列的任何位置(即出现在其他行),就把该行标记为 "Yes",否则标记为 "No"。Reversal_Accounting_Transaction_ID 列中的大多数条目可能为空,这些行应标记为 "No"。
数据框是从一个 6gb 的 csv 文件(假设大约 600 万行)创建的,并且正在数据块上进行处理。
我不太确定为什么要花这么长时间
Rcpp::cppFunction('
std::vector<std::string>
reversals(DataFrame frame)
{
    // Purpose: for every row, report whether its reversal ID (column "STBLG")
    // occurs anywhere in the transaction ID column (column "BELNR").
    //
    // frame   - data frame providing the columns "BELNR" and "STBLG", both
    //           converted to vectors of strings.
    // returns - one entry per row: "Yes" if the row has a non-empty reversal ID
    //           that equals some transaction ID, otherwise "No".
    //
    // Cost: each non-empty reversal ID triggers a linear scan over all
    // transaction IDs, so the worst case is quadratic in the number of rows.
    std::vector<std::string> Accounting_Transaction_ID = as<std::vector<std::string> >(frame["BELNR"]);
    std::vector<std::string> Reversal_Accounting_Transaction_ID = as<std::vector<std::string> >(frame["STBLG"]);

    // Every row starts as "No"; only a confirmed match upgrades it to "Yes".
    // This also covers the single-row case, where no comparison may happen.
    std::vector<std::string> ReversalIndicator(Reversal_Accounting_Transaction_ID.size(), "No");

    const std::size_t rowCount = Reversal_Accounting_Transaction_ID.size();
    const std::size_t idCount = Accounting_Transaction_ID.size();
    for (std::size_t i = 0; i < rowCount; ++i) {
        if (Reversal_Accounting_Transaction_ID[i].empty()) {
            continue;  // nothing to look up for this row
        }
        // Scan from index 0 so that the first transaction ID is considered too.
        for (std::size_t j = 0; j < idCount; ++j) {
            if (Accounting_Transaction_ID[j] == Reversal_Accounting_Transaction_ID[i]) {
                ReversalIndicator[i] = "Yes";
                break;
            }
        }
    }
    return ReversalIndicator;
}
')```
```df$reversal=reversals(df)```
您对数据框的每一行都完整遍历了一次数据框,也就是说大约要执行 600 万 × 600 万次操作(O(N^2)),这可能需要相当长的时间。不过,您可以牺牲一些内存,把复杂度从 O(N^2) 降到 O(N)。由于没有样本数据,我无法测试,所以只提供一些伪代码:
create empty set data structure
for each row in df:
    add Accounting_Transaction_ID to set
for each row in df:
    if Reversal_Accounting_Transaction_ID is non-empty and can be found in set:
        ReversalIndicator = "Yes"
    else:
        ReversalIndicator = "No"
基于拉尔夫的回答
不确定是否需要在开始时为集合分配大小?
Rcpp::cppFunction('
std::vector<std::string> reversals(DataFrame frame)
{
    // Purpose: for every row, report whether its reversal ID (column "STBLG")
    // occurs anywhere in the transaction ID column (column "BELNR").
    //
    // frame   - data frame providing the columns "BELNR" and "STBLG", both
    //           converted to vectors of strings.
    // returns - one entry per row: "Yes" if the row has a non-empty reversal ID
    //           that equals some transaction ID, otherwise "No".
    //
    // All transaction IDs are collected into an ordered set once, so each row
    // needs a single set lookup instead of a scan over every other row.
    std::vector<std::string> Accounting_Transaction_ID = as<std::vector<std::string> >(frame["BELNR"]);
    std::vector<std::string> Reversal_Accounting_Transaction_ID = as<std::vector<std::string> >(frame["STBLG"]);

    // Every row starts as "No"; only a confirmed match upgrades it to "Yes".
    std::vector<std::string> ReversalIndicator(Reversal_Accounting_Transaction_ID.size(), "No");

    // std::set has no capacity to reserve up front; constructing it from the
    // iterator range is all the setup it needs.
    const std::set<std::string> uniqueTransID(Accounting_Transaction_ID.begin(),
                                              Accounting_Transaction_ID.end());

    const std::size_t rowCount = Reversal_Accounting_Transaction_ID.size();
    for (std::size_t i = 0; i < rowCount; ++i) {
        const std::string& reversalID = Reversal_Accounting_Transaction_ID[i];
        // Only non-empty reversal IDs are looked up; an empty one never counts
        // as a match, even if an empty transaction ID happens to exist.
        if (!reversalID.empty() && uniqueTransID.count(reversalID) > 0) {
            ReversalIndicator[i] = "Yes";
        }
    }
    return ReversalIndicator;
}
')```
我写了一个简单的函数:如果某一行 Reversal_Accounting_Transaction_ID 中的值出现在 Accounting_Transaction_ID 列的任何位置(即出现在其他行),就把该行标记为 "Yes",否则标记为 "No"。Reversal_Accounting_Transaction_ID 列中的大多数条目可能为空,这些行应标记为 "No"。
数据框是从一个 6gb 的 csv 文件(假设大约 600 万行)创建的,并且正在数据块上进行处理。
我不太确定为什么要花这么长时间
Rcpp::cppFunction('
std::vector<std::string>
reversals(DataFrame frame)
{
    // Purpose: for every row, report whether its reversal ID (column "STBLG")
    // occurs anywhere in the transaction ID column (column "BELNR").
    //
    // frame   - data frame providing the columns "BELNR" and "STBLG", both
    //           converted to vectors of strings.
    // returns - one entry per row: "Yes" if the row has a non-empty reversal ID
    //           that equals some transaction ID, otherwise "No".
    //
    // Cost: each non-empty reversal ID triggers a linear scan over all
    // transaction IDs, so the worst case is quadratic in the number of rows.
    std::vector<std::string> Accounting_Transaction_ID = as<std::vector<std::string> >(frame["BELNR"]);
    std::vector<std::string> Reversal_Accounting_Transaction_ID = as<std::vector<std::string> >(frame["STBLG"]);

    // Every row starts as "No"; only a confirmed match upgrades it to "Yes".
    // This also covers the single-row case, where no comparison may happen.
    std::vector<std::string> ReversalIndicator(Reversal_Accounting_Transaction_ID.size(), "No");

    const std::size_t rowCount = Reversal_Accounting_Transaction_ID.size();
    const std::size_t idCount = Accounting_Transaction_ID.size();
    for (std::size_t i = 0; i < rowCount; ++i) {
        if (Reversal_Accounting_Transaction_ID[i].empty()) {
            continue;  // nothing to look up for this row
        }
        // Scan from index 0 so that the first transaction ID is considered too.
        for (std::size_t j = 0; j < idCount; ++j) {
            if (Accounting_Transaction_ID[j] == Reversal_Accounting_Transaction_ID[i]) {
                ReversalIndicator[i] = "Yes";
                break;
            }
        }
    }
    return ReversalIndicator;
}
')```
```df$reversal=reversals(df)```
您对数据框的每一行都完整遍历了一次数据框,也就是说大约要执行 600 万 × 600 万次操作(O(N^2)),这可能需要相当长的时间。不过,您可以牺牲一些内存,把复杂度从 O(N^2) 降到 O(N)。由于没有样本数据,我无法测试,所以只提供一些伪代码:
create empty set data structure
for each row in df:
    add Accounting_Transaction_ID to set
for each row in df:
    if Reversal_Accounting_Transaction_ID is non-empty and can be found in set:
        ReversalIndicator = "Yes"
    else:
        ReversalIndicator = "No"
基于拉尔夫的回答
不确定是否需要在开始时为集合分配大小?
Rcpp::cppFunction('
std::vector<std::string> reversals(DataFrame frame)
{
    // Purpose: for every row, report whether its reversal ID (column "STBLG")
    // occurs anywhere in the transaction ID column (column "BELNR").
    //
    // frame   - data frame providing the columns "BELNR" and "STBLG", both
    //           converted to vectors of strings.
    // returns - one entry per row: "Yes" if the row has a non-empty reversal ID
    //           that equals some transaction ID, otherwise "No".
    //
    // All transaction IDs are collected into an ordered set once, so each row
    // needs a single set lookup instead of a scan over every other row.
    std::vector<std::string> Accounting_Transaction_ID = as<std::vector<std::string> >(frame["BELNR"]);
    std::vector<std::string> Reversal_Accounting_Transaction_ID = as<std::vector<std::string> >(frame["STBLG"]);

    // Every row starts as "No"; only a confirmed match upgrades it to "Yes".
    std::vector<std::string> ReversalIndicator(Reversal_Accounting_Transaction_ID.size(), "No");

    // std::set has no capacity to reserve up front; constructing it from the
    // iterator range is all the setup it needs.
    const std::set<std::string> uniqueTransID(Accounting_Transaction_ID.begin(),
                                              Accounting_Transaction_ID.end());

    const std::size_t rowCount = Reversal_Accounting_Transaction_ID.size();
    for (std::size_t i = 0; i < rowCount; ++i) {
        const std::string& reversalID = Reversal_Accounting_Transaction_ID[i];
        // Only non-empty reversal IDs are looked up; an empty one never counts
        // as a match, even if an empty transaction ID happens to exist.
        if (!reversalID.empty() && uniqueTransID.count(reversalID) > 0) {
            ReversalIndicator[i] = "Yes";
        }
    }
    return ReversalIndicator;
}
')```