在不使用 JOIN 的情况下,在 Google 工作表中的逗号分隔值列中获取唯一值和它们的计数?
Getting the unique values and counts of them in a column of comma separated values in Google Sheets without using JOIN?
我有一个列,里面有一堆成分列表。我想弄清楚不同的成分出现了多少次。有 73,000 行。此 question 的答案适用于 Google Sheet 秒内的少量数据。
公式为=UNIQUE(TRANSPOSE(SPLIT(JOIN(", ";A2:A);", ";FALSE)))
但是我在这里写了 50000 多个字符 JOIN
不堪重负。还有其他方法可以解决这个问题吗?
Sheet: https://docs.google.com/spreadsheets/d/1t0P9hMmVpwhI2IbATmIMjobuALTg8VWhl8-AQaq3zIo/edit?usp=sharing
=UNIQUE(TRANSPOSE(SPLIT(REGEXREPLACE(TRANSPOSE(
QUERY(ARRAYFORMULA(","&A1:A),,5000000))," ,",","),",")))
但也许您需要这个 (?):
=QUERY(TRANSPOSE(SPLIT(REGEXREPLACE(TRANSPOSE(
QUERY(ARRAYFORMULA(","&A1:A),,5000000))," ,",","),",")),
"select Col1,count(Col1)
where Col1 is not null
group by Col1
label count(Col1)''")
=ARRAYFORMULA(UNIQUE(TRIM(TRANSPOSE(SPLIT(TRANSPOSE(
QUERY(","&A1:A,,5000000)),",")))))
=QUERY(QUERY(ARRAYFORMULA(TRIM(TRANSPOSE(SPLIT(TRANSPOSE(
QUERY(","&A1:A,,5000000)),",")))),
"select Col1,count(Col1)
where Col1 is not null
group by Col1
label count(Col1)''"),
"order by Col2 desc")
我做了一个 google 脚本解决方案,因为我想玩一下键映射对。
function myFunction() {
var myMap = {"candy":0};
var sh = SpreadsheetApp.getActiveSpreadsheet();
var ss = sh.getSheetByName("FIRSTSHEETNAME");
var os = sh.getSheetByName("Ingredients");
var data = ss.getDataRange().getValues();
for (var i=0; i<data.length;i++)//full
//for (var i=1; i<4000;i++)//test
{
var array = data[i][0].split( ",");
for (var j=0; j<array.length;j++)
{
var item = array[j];
//Logger.log(array[j]);
if (myMap[item]>-1){
//Logger.log("REPEAT INGREDIENT");
var num = parseInt(myMap[item]);
num++;
myMap[item]=num;
//Logger.log(item +" "+num);
} else {
myMap[item]=1;
//Logger.log("New Ingredient: "+item);
//Logger.log(myMap);
}
}
}
//Logger.log(myMap);
var output=[];
for (var key in myMap){
//Logger.log("Ack");
output.push([key,myMap[key]]);
}
//Logger.log(output);
os.getRange(2,1,output.length,output[0].length).setValues(output);
}
您需要为输出添加一个 "Ingredients" 选项卡,并将您的第一个选项卡更改为名为 FIRSTSHEETNAME(或更改代码)。在我的测试中,4 件物品需要 4 秒,400 件物品需要 5 秒,4000 件物品需要 6 秒。前导空格可能存在问题,但这为您提供了一个起点。
适用于至少 40,000 行的列的快速 运行 公式:
=query(arrayformula(TRIM(flatten(split(A2:A20000,",")))),"select Col1,Count(Col1) Where NOT (Col1='' OR Col1 contains '#VALUE!') Group By Col1 order by Count(Col1) desc label Col1 'Ingredient',Count(Col1) 'Freq.'")
FLATTEN function, combined with SQL (QUERY function) 可以作为快速过滤值(例如空或错误消息)的解决方案。
TRIM function 避免了由于每个字符串无意义的空格 before/after 而导致的结果中的伪影。
Sheet: https://docs.google.com/spreadsheets/d/1m9EvhQB1Leg2H7L52WhPe66_jRrTc8VsnZcliQsxJ7s/edit?usp=sharing
*如果出现错误的大小写差异,您可以在使用 UPPER(A2:A20000).
的同一公式之前将字符串的所有字符标准化为大写
我有一个列,里面有一堆成分列表。我想弄清楚不同的成分出现了多少次。有 73,000 行。此 question 的答案适用于 Google Sheet 秒内的少量数据。
公式为=UNIQUE(TRANSPOSE(SPLIT(JOIN(", ";A2:A);", ";FALSE)))
但是我在这里写了 50000 多个字符 JOIN
不堪重负。还有其他方法可以解决这个问题吗?
Sheet: https://docs.google.com/spreadsheets/d/1t0P9hMmVpwhI2IbATmIMjobuALTg8VWhl8-AQaq3zIo/edit?usp=sharing
=UNIQUE(TRANSPOSE(SPLIT(REGEXREPLACE(TRANSPOSE(
QUERY(ARRAYFORMULA(","&A1:A),,5000000))," ,",","),",")))
但也许您需要这个 (?):
=QUERY(TRANSPOSE(SPLIT(REGEXREPLACE(TRANSPOSE(
QUERY(ARRAYFORMULA(","&A1:A),,5000000))," ,",","),",")),
"select Col1,count(Col1)
where Col1 is not null
group by Col1
label count(Col1)''")
=ARRAYFORMULA(UNIQUE(TRIM(TRANSPOSE(SPLIT(TRANSPOSE(
QUERY(","&A1:A,,5000000)),",")))))
=QUERY(QUERY(ARRAYFORMULA(TRIM(TRANSPOSE(SPLIT(TRANSPOSE(
QUERY(","&A1:A,,5000000)),",")))),
"select Col1,count(Col1)
where Col1 is not null
group by Col1
label count(Col1)''"),
"order by Col2 desc")
我做了一个 google 脚本解决方案,因为我想玩一下键映射对。
function myFunction() {
var myMap = {"candy":0};
var sh = SpreadsheetApp.getActiveSpreadsheet();
var ss = sh.getSheetByName("FIRSTSHEETNAME");
var os = sh.getSheetByName("Ingredients");
var data = ss.getDataRange().getValues();
for (var i=0; i<data.length;i++)//full
//for (var i=1; i<4000;i++)//test
{
var array = data[i][0].split( ",");
for (var j=0; j<array.length;j++)
{
var item = array[j];
//Logger.log(array[j]);
if (myMap[item]>-1){
//Logger.log("REPEAT INGREDIENT");
var num = parseInt(myMap[item]);
num++;
myMap[item]=num;
//Logger.log(item +" "+num);
} else {
myMap[item]=1;
//Logger.log("New Ingredient: "+item);
//Logger.log(myMap);
}
}
}
//Logger.log(myMap);
var output=[];
for (var key in myMap){
//Logger.log("Ack");
output.push([key,myMap[key]]);
}
//Logger.log(output);
os.getRange(2,1,output.length,output[0].length).setValues(output);
}
您需要为输出添加一个 "Ingredients" 选项卡,并将您的第一个选项卡更改为名为 FIRSTSHEETNAME(或更改代码)。在我的测试中,4 件物品需要 4 秒,400 件物品需要 5 秒,4000 件物品需要 6 秒。前导空格可能存在问题,但这为您提供了一个起点。
适用于至少 40,000 行的列的快速 运行 公式:
=query(arrayformula(TRIM(flatten(split(A2:A20000,",")))),"select Col1,Count(Col1) Where NOT (Col1='' OR Col1 contains '#VALUE!') Group By Col1 order by Count(Col1) desc label Col1 'Ingredient',Count(Col1) 'Freq.'")
FLATTEN function, combined with SQL (QUERY function) 可以作为快速过滤值(例如空或错误消息)的解决方案。
TRIM function 避免了由于每个字符串无意义的空格 before/after 而导致的结果中的伪影。
Sheet: https://docs.google.com/spreadsheets/d/1m9EvhQB1Leg2H7L52WhPe66_jRrTc8VsnZcliQsxJ7s/edit?usp=sharing
*如果出现错误的大小写差异,您可以在使用 UPPER(A2:A20000).
的同一公式之前将字符串的所有字符标准化为大写