如何使用 Java 在 Spark 中组合或合并两个稀疏向量?
How to combine or merge two sparse vectors in Spark using Java?
我使用了 Java 的 API,即 Apache-Spark 1.2.0,并创建了两个稀疏向量,如下所示。
Vector v1 = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});
Vector v2 = Vectors.sparse(2, new int[]{0, 1}, new double[]{4,5});
如何得到由 v1 和 v2 组合而成的新向量 v3?结果应该是:(5, [0, 2, 3, 4], [1.0, 3.0, 4.0, 5.0])
我发现这个问题已经一年了,仍然悬而未决。这里我自己写了一个辅助函数解决了这个问题,如下。
/**
 * Concatenates the given sparse vectors into a single sparse vector whose
 * dimension is the sum of the input dimensions. The indices of each input
 * vector are shifted by the total size of the vectors that precede it.
 *
 * @param svs the sparse vectors to concatenate, in order
 * @return the concatenated sparse vector, or {@code null} (after printing
 *         "all zeroes") when no input vector has a non-zero entry
 */
public static SparseVector combineSparseVectors(SparseVector... svs) {
    int totalSize = 0;
    int totalNonzeros = 0;
    for (SparseVector sv : svs) {
        totalSize += sv.size();
        totalNonzeros += sv.indices().length;
    }
    // Guard clause: nothing non-zero to merge.
    if (totalNonzeros == 0) {
        System.out.println("all zeroes");
        return null;
    }
    int[] mergedIndices = new int[totalNonzeros];
    double[] mergedValues = new double[totalNonzeros];
    int cursor = 0;  // next free slot in the merged arrays
    int offset = 0;  // index displacement contributed by preceding vectors
    for (SparseVector sv : svs) {
        int[] idx = sv.indices();
        double[] vals = sv.values();
        for (int k = 0; k < idx.length; k++) {
            mergedIndices[cursor] = idx[k] + offset;
            mergedValues[cursor] = vals[k];
            cursor++;
        }
        offset += sv.size();
    }
    return new SparseVector(totalSize, mergedIndices, mergedValues);
}
根据 HappyCoding 的回答,我只想使用 pyspark
和 numpy
在 Python 3.x 中贡献我的代码
import numpy as np
from pyspark.mllib.linalg import SparseVector
def concat_sparse_vectors(sparse_vectors):
    """Concatenate sparse vectors into one sparse vector.

    The resulting vector's size is the sum of the input sizes; each input
    vector's indices are shifted by the combined size of the vectors that
    precede it.

    Input:
        sparse_vectors (list): A list containing sparse vectors
    Output:
        A concatenated SparseVector. When every input vector has no
        non-zero entries, an all-zero SparseVector of the combined size
        is returned. (The previous version printed 'All zeros' and
        implicitly returned None in that case, forcing callers to
        special-case the result.)
    """
    size = sum(v.size for v in sparse_vectors)
    nonzeros = sum(len(v.indices) for v in sparse_vectors)
    # Indices are positions, so build them with an integer dtype instead of
    # the float array np.zeros() would give by default.
    indices = np.zeros(nonzeros, dtype=np.int64)
    values = np.zeros(nonzeros)
    pos = 0     # next free slot in the output arrays
    offset = 0  # index displacement contributed by preceding vectors
    for vector in sparse_vectors:
        count = len(vector.indices)
        # Vectorized copy replaces the element-by-element loops.
        indices[pos:pos + count] = vector.indices + offset
        values[pos:pos + count] = vector.values
        pos += count
        offset += vector.size
    return SparseVector(size, indices, values)
我使用了 Java 的 API,即 Apache-Spark 1.2.0,并创建了两个稀疏向量,如下所示。
Vector v1 = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});
Vector v2 = Vectors.sparse(2, new int[]{0, 1}, new double[]{4,5});
如何得到由 v1 和 v2 组合而成的新向量 v3?结果应该是:(5, [0, 2, 3, 4], [1.0, 3.0, 4.0, 5.0])
我发现这个问题已经一年了,仍然悬而未决。这里我自己写了一个辅助函数解决了这个问题,如下。
/**
 * Concatenates the given sparse vectors into a single sparse vector whose
 * dimension is the sum of the input dimensions. The indices of each input
 * vector are shifted by the total size of the vectors that precede it.
 *
 * @param svs the sparse vectors to concatenate, in order
 * @return the concatenated sparse vector, or {@code null} (after printing
 *         "all zeroes") when no input vector has a non-zero entry
 */
public static SparseVector combineSparseVectors(SparseVector... svs) {
    int totalSize = 0;
    int totalNonzeros = 0;
    for (SparseVector sv : svs) {
        totalSize += sv.size();
        totalNonzeros += sv.indices().length;
    }
    // Guard clause: nothing non-zero to merge.
    if (totalNonzeros == 0) {
        System.out.println("all zeroes");
        return null;
    }
    int[] mergedIndices = new int[totalNonzeros];
    double[] mergedValues = new double[totalNonzeros];
    int cursor = 0;  // next free slot in the merged arrays
    int offset = 0;  // index displacement contributed by preceding vectors
    for (SparseVector sv : svs) {
        int[] idx = sv.indices();
        double[] vals = sv.values();
        for (int k = 0; k < idx.length; k++) {
            mergedIndices[cursor] = idx[k] + offset;
            mergedValues[cursor] = vals[k];
            cursor++;
        }
        offset += sv.size();
    }
    return new SparseVector(totalSize, mergedIndices, mergedValues);
}
根据 HappyCoding 的回答,我只想使用 pyspark
和 numpy
import numpy as np
from pyspark.mllib.linalg import SparseVector
def concat_sparse_vectors(sparse_vectors):
    """Concatenate sparse vectors into one sparse vector.

    The resulting vector's size is the sum of the input sizes; each input
    vector's indices are shifted by the combined size of the vectors that
    precede it.

    Input:
        sparse_vectors (list): A list containing sparse vectors
    Output:
        A concatenated SparseVector. When every input vector has no
        non-zero entries, an all-zero SparseVector of the combined size
        is returned. (The previous version printed 'All zeros' and
        implicitly returned None in that case, forcing callers to
        special-case the result.)
    """
    size = sum(v.size for v in sparse_vectors)
    nonzeros = sum(len(v.indices) for v in sparse_vectors)
    # Indices are positions, so build them with an integer dtype instead of
    # the float array np.zeros() would give by default.
    indices = np.zeros(nonzeros, dtype=np.int64)
    values = np.zeros(nonzeros)
    pos = 0     # next free slot in the output arrays
    offset = 0  # index displacement contributed by preceding vectors
    for vector in sparse_vectors:
        count = len(vector.indices)
        # Vectorized copy replaces the element-by-element loops.
        indices[pos:pos + count] = vector.indices + offset
        values[pos:pos + count] = vector.values
        pos += count
        offset += vector.size
    return SparseVector(size, indices, values)