Storing ORC format through Spark in Java
I am using Spark 1.3.1 and I want to store data in Hive in ORC format.
The following line throws an error; it seems ORC is not supported as a data source in Spark 1.3.1:
dataframe.save("/apps/hive/warehouse/person_orc_table_5", "orc");
java.lang.RuntimeException: Failed to load class for data source: orc
at scala.sys.package$.error(package.scala:27)
at org.apache.spark.sql.sources.ResolvedDataSource$.lookupDataSource(ddl.scala:194)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:237)
at org.apache.spark.sql.DataFrame.save(DataFrame.scala:1196)
at org.apache.spark.sql.DataFrame.save(DataFrame.scala:1156)
at SparkOrcHive.main(SparkOrcHive.java:62)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:577)
at org.apache.spark.deploy.SparkSubmit$.doRunMain(SparkSubmit.scala:174)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:197)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:112)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Spark 1.4 has:
write.format("orc").partitionBy("age").save("peoplePartitioned")
to store in ORC format.
Is there any way to store files in ORC format in Spark 1.3.1?
Thanks,
In Spark 1.3.1 the short name "orc" is not registered as a data source; pass the fully qualified class name instead (this needs a HiveContext, since ORC support lives in Spark's Hive module):
dataframe.select("name", "age")
         .save("/apps/hive/warehouse/orc_table",
               "org.apache.spark.sql.hive.orc",
               SaveMode.Append);
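To read the data back on Spark 1.3.1, the same fully qualified source name should work with the load overload that takes a path and a source (a sketch, assuming a HiveContext named hiveContext is already in scope; from 1.4 onwards this becomes read.format(...).load(...)):
// In Spark 1.3.x the ORC data source has no registered short name,
// so the fully qualified class name is passed instead of "orc".
DataFrame people = hiveContext.load(
        "/apps/hive/warehouse/orc_table",
        "org.apache.spark.sql.hive.orc");
people.printSchema();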
Edit:
I am picking up a txt file from HDFS and writing the data into a Hive table in ORC format. The code below works fine for me on Spark 1.3.1.
Java class
package com.test.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.hive.HiveContext;

/**
 * Created by ankit on 08/02/16.
 */
public class SparkOrcHiveInsert {

    public static void main(String[] args) {
        String tableName = "person_orc";
        String tablePath = "/apps/hive/warehouse/" + tableName;

        SparkConf conf = new SparkConf().setAppName("ORC Demo").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // ORC support lives in the Hive module, so a HiveContext is required.
        HiveContext hiveContext = new HiveContext(sc.sc());

        // Parse each comma-separated line of the input file into a Person bean.
        JavaRDD<Person> people = sc.textFile("hdfs://~:8020/tmp/person.txt").map(
                new Function<String, Person>() {
                    public Person call(String line) throws Exception {
                        return process(line);
                    }
                });

        // Infer the schema from the Person bean and append to the table path,
        // using the fully qualified ORC data source name.
        DataFrame schemaPeople = hiveContext.createDataFrame(people, Person.class);
        schemaPeople.select("id", "name", "age")
                .save(tablePath, "org.apache.spark.sql.hive.orc", SaveMode.Append);
    }

    private static Person process(String line) {
        String[] parts = line.split(",");
        Person person = new Person();
        person.setId(Integer.parseInt(parts[0].trim()));
        person.setName(parts[1]);
        person.setAge(Integer.parseInt(parts[2].trim()));
        return person;
    }
}
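The Person bean is not shown in the original post; below is a minimal sketch of what it presumably looks like. Spark's createDataFrame(JavaRDD, Class) infers the column names and types from JavaBean getters, and the class must be serializable so instances can be shipped to executors:
package com.test.spark;

import java.io.Serializable;

// Hypothetical reconstruction of the Person bean used above.
public class Person implements Serializable {
    private int id;
    private String name;
    private int age;

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
}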
Hive table script
create table person_orc (
id int,
name string,
age int
) stored as orc tblproperties ("orc.compress"="NONE");
Spark submit command
~/spark/bin/spark-submit --master local --class com.test.spark.SparkOrcHiveInsert spark-orc-hive-1.0.jar
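As a sanity check after the job finishes, the table can be queried through the same HiveContext. A hypothetical verification snippet (not part of the original post):
// Confirm the appended rows are visible through the Hive table.
DataFrame result = hiveContext.sql("SELECT id, name, age FROM person_orc");
result.show();
System.out.println("row count: " + result.count());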