调用 lambda 函数时找不到模块
Posted
技术标签:
【中文标题】调用 lambda 函数时找不到模块【英文标题】:no module found when calling lambda function 【发布时间】:2018-01-23 03:58:00 【问题描述】:我正在尝试在 PySpark 1.6 上运行 python 程序。下面的脚本使用名为“dateutil”的模块将时间从一个时区转换为另一个时区。我检查了 dateutil 模块是否安装在所有工作节点和我用来提交作业的当前系统上。
执行命令:
spark-submit --packages "com.databricks:spark-csv_2.11:1.5.0" test.py
脚本:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
#import pytz
from dateutil import tz
utc_tz = dateutil.tz.gettz('UTC')
#utc_tz = pytz.timezone('UTC')
utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
#print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
aest_time = dateutil.tz.gettz('AEST')
math.acos (1)
#print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
#aedt_time = tz.gettz('AEDT')
#print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
#return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
错误:
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1293, in takeUpToNumLeft
File "/home/xxxx/test.py", line 50, in <lambda>
File "/home/xxxx/test.py", line 34, in utcToAESTDateString
NameError: global name 'dateutil' is not defined
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
【问题讨论】:
尝试把这个 import 语句放在最上面(而不是在 func 定义中) from dateutil import tz @SuryaAvala 我已经尝试了你的建议,结果是一样的。 你安装了 dateutil 吗? 我已经通过使用 pyspark 和 python 运行“import dateutil”在工作节点上检查了它,它们都可以正常工作。 啊,明白了!一旦你导入了类似 from dateutil import tz 的东西,当你再次使用 tz 时,你就不能再做 dateutil.tz 了。你可以做 tz 【参考方案1】:更改这些行utc_tz = tz.gettz('UTC')
和aest_time = tz.gettz('AEST')
因此,当您像这样导入特定方法时:
from dateutil import tz
你不能像dateutil.tz
那样调用函数
你必须做tz()
。
你的代码应该如下:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
#import pytz
from dateutil import tz
utc_tz = tz.gettz('UTC')
#utc_tz = pytz.timezone('UTC')
utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
#print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
aest_time = tz.gettz('AEST')
math.acos (1)
#print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
#aedt_time = tz.gettz('AEDT')
#print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
#return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
【讨论】:
我仍然有问题。这是错误:File "/home/dat.nguyen/test.py", line 32, in utcToAESTDateString ImportError: No module named dateutil
你的第 32 行是什么?是这个 utc_tz = tz.gettz('UTC') 吗?以上是关于调用 lambda 函数时找不到模块的主要内容,如果未能解决你的问题,请参考以下文章
在服务器上调用 PHP 函数但在本地服务器上工作时找不到 JS 函数