How to get Spark 2.3 working in Jupyter Notebook
I am currently trying to get Spark 2.3 working in Jupyter Notebook.
So far I have created the kernel as follows:
- Create the environment file:
$ cat rxie20181012-pyspark.yml
name: rxie20181012-pyspark
dependencies:
- pyspark
- Create the environment from the file:
conda env create -f rxie20181012-pyspark.yml
- Activate the new environment:
source activate rxie20181012-pyspark
- Create the kernel based on the conda env:
sudo ./python -m ipykernel install --name rxie20181012-pyspark --display-name "Python (rxie20181012-pyspark)"
- The kernel.json is as follows:
cat /usr/local/share/jupyter/kernels/rxie20181012-pyspark/kernel.json
{
"display_name": "Python (rxie20181012-pyspark)",
"language": "python",
"argv": [
"/opt/cloudera/parcels/Anaconda-4.2.0/bin/python",
"-m",
"ipykernel",
"-f",
"{connection_file}"
]
}
- After noticing that the notebook failed to import pyspark, I added the following env section to kernel.json:
{
"display_name": "Python (rxie20181012-pyspark)",
"language": "python",
"argv": [
"/opt/cloudera/parcels/Anaconda-4.2.0/bin/python",
"-m",
"ipykernel",
"-f",
"{connection_file}"
],
"env": {
"HADOOP_CONF_DIR": "/etc/spark2/conf/yarn-conf",
"PYSPARK_PYTHON":"/opt/cloudera/parcels/Anaconda/bin/python",
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2",
"PYTHONPATH": "/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip:/opt/cloudera/parcels/SPARK2/lib/spark2/python/",
"PYTHONSTARTUP": "/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/shell.py",
"PYSPARK_SUBMIT_ARGS": " --master yarn --deploy-mode client pyspark-shell"
}
}
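To confirm the kernel actually picks up this env block, here is a minimal sanity check to run from a notebook cell (a sketch; it only prints the variables set above):
import os

# Every variable from the kernel.json "env" block should show up here with
# the expected value; a missing one means the notebook was not started
# through this kernelspec.
for var in ("SPARK_HOME", "HADOOP_CONF_DIR", "PYSPARK_PYTHON",
            "PYTHONPATH", "PYSPARK_SUBMIT_ARGS"):
    print("%s=%s" % (var, os.environ.get(var)))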
Importing pyspark no longer errors out, but a SparkSession still fails to start:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('abc').getOrCreate()
OSError Traceback (most recent call last)
in <module>()
----> 1 spark = SparkSession.builder.appName('abc').getOrCreate()
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/sql/session.pyc
in getOrCreate(self)
171 for key, value in self._options.items():
172 sparkConf.set(key, value)
--> 173 sc = SparkContext.getOrCreate(sparkConf)
174 # This SparkContext may be an existing one.
175 for key, value in self._options.items():
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/context.pyc in
getOrCreate(cls, conf)
341 with SparkContext._lock:
342 if SparkContext._active_spark_context is None:
--> 343 SparkContext(conf=conf or SparkConf())
344 return SparkContext._active_spark_context
345
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/context.pyc in
__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
113 """
114 self._callsite = first_spark_call() or CallSite(None, None, None)
--> 115 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
116 try:
117 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/context.pyc in
_ensure_initialized(cls, instance, gateway, conf)
290 with SparkContext._lock:
291 if not SparkContext._gateway:
--> 292 SparkContext._gateway = gateway or launch_gateway(conf)
293 SparkContext._jvm = SparkContext._gateway.jvm
294
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/java_gateway.pyc
in launch_gateway(conf)
81 def preexec_func():
82 signal.signal(signal.SIGINT, signal.SIG_IGN)
---> 83 proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
84 else:
85 # preexec_fn not supported on Windows
/opt/cloudera/parcels/Anaconda/lib/python2.7/subprocess.pyc in
__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags)
709 p2cread, p2cwrite,
710 c2pread, c2pwrite,
--> 711 errread, errwrite)
712 except Exception:
713 # Preserve original exception in case os.close raises.
/opt/cloudera/parcels/Anaconda/lib/python2.7/subprocess.pyc in
_execute_child(self, args, executable, preexec_fn, close_fds, cwd, env, universal_newlines, startupinfo, creationflags, shell, to_close,
p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite)
1341 raise
1342 child_exception = pickle.loads(data)
-> 1343 raise child_exception
1344
1345
OSError: [Errno 2] No such file or directory
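For context: the OSError is raised by Popen inside launch_gateway, which executes ${SPARK_HOME}/bin/spark-submit, so [Errno 2] means that file does not exist under the configured SPARK_HOME. A one-cell check (a sketch, assuming the env block above is in effect):
import os

# pyspark's java_gateway launches ${SPARK_HOME}/bin/spark-submit; if this
# prints False, SparkContext creation fails exactly as in the traceback.
spark_submit = os.path.join(os.environ["SPARK_HOME"], "bin", "spark-submit")
print("%s exists: %s" % (spark_submit, os.path.exists(spark_submit)))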
Can anyone help me sort this out? Thanks from the bottom of my heart.
Root cause identified, and it works now:
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2"
should be replaced with:
"SPARK_HOME": "/opt/cloudera/parcels/SPARK2/lib/spark2"