Notes on PySpark's RDD Code
Notes on the code in PySpark's rdd.py.
The code version is Spark 2.2.0.
1. The RDD class and common operators
class RDD(object):  # Only a few representative operators are covered here; read rdd.py for the rest.
    def __init__(self, jrdd, ctx, jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())):
        """
        _jrdd is a very important attribute; it is carried along through the whole
        PySpark computation. The first RDD created in PySpark is usually a data-source
        RDD built through a JVM call, and _jrdd holds that corresponding JVM RDD.
        Keep this in mind: when a job actually runs, it is this JVM RDD that gets
        executed and that hands the source data over to the Python worker process.
        """
        self._jrdd = jrdd
        self.is_cached = False
        self.is_checkpointed = False
        self.ctx = ctx
        self._jrdd_deserializer = jrdd_deserializer
        self._id = jrdd.id()
        self.partitioner = None
    # The most important and most basic action.
    # Every other action is ultimately implemented through this one.
    def collect(self):
        """
        Returns a list with the results of all partitions.
        It calls collectAndServe on the corresponding Scala PythonRDD object to trigger
        execution of the job. collect is the foundation and entry point of every other
        action; in other words, collectAndServe is the single job-submission entry point.
        """
        with SCCallSiteSync(self.context) as css:
            # The job is submitted with one argument: the RDD behind _jrdd,
            # which is either the original data-source RDD or a PythonRDD.
            # Remember this; its role becomes clear once we reach the Scala PythonRDD.
            port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
        return list(_load_from_socket(port, self._jrdd_deserializer))
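For reference, a minimal usage sketch of what the caller sees; `sc` here is assumed to be an already-created SparkContext:

rdd = sc.parallelize(range(10), 3)   # 3 partitions
print(rdd.collect())                 # one driver-side list with the results of all partitions: [0, 1, ..., 9]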
    # The reduce action
    def reduce(self, f):
        """
        Note that this, too, ends up calling collect().
        """
        def func(iterator):
            iterator = iter(iterator)
            try:
                initial = next(iterator)
            except StopIteration:
                return
            yield reduce(f, iterator, initial)
        vals = self.mapPartitions(func).collect()  # the job is triggered here
        if vals:
            return reduce(f, vals)
        raise ValueError("Can not reduce() empty RDD")
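To make the two-stage structure explicit, here is a pure-Python sketch of the same logic, simulating partitions with plain lists (illustrative only):

from functools import reduce as _reduce

partitions = [[1, 2, 3], [4, 5], []]               # an empty partition contributes nothing
f = lambda a, b: a + b

def partition_reduce(part):
    it = iter(part)
    try:
        initial = next(it)
    except StopIteration:
        return []
    return [_reduce(f, it, initial)]

vals = [v for part in partitions for v in partition_reduce(part)]   # the "collect" step
print(_reduce(f, vals))                            # 15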
    # This function underpins several other actions, and it too is implemented via collect().
    def fold(self, zeroValue, op):
        """
        This function also submits the job by ultimately calling collect().
        It is used by actions such as foreach, sum and count.
        """
        def func(iterator):
            acc = zeroValue
            for obj in iterator:
                acc = op(acc, obj)
            yield acc
        vals = self.mapPartitions(func).collect()  # the job is triggered here
        return reduce(op, vals, zeroValue)
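Because zeroValue is folded in once per partition and once more on the driver, it should be an identity element for op. A pure-Python sketch, again simulating partitions with lists:

from functools import reduce as _reduce

partitions = [[1, 2], [3]]
op = lambda a, b: a + b
zero = 10                                          # deliberately NOT an identity element
vals = [_reduce(op, part, zero) for part in partitions]   # per-partition fold -> [13, 13]
print(_reduce(op, vals, zero))                     # 36 rather than 6: zero was applied 3 times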
    def union(self, other):
        """
        PySpark does little extra work for this operator; it delegates directly to
        union on the JVM side.
        """
        if self._jrdd_deserializer == other._jrdd_deserializer:
            rdd = RDD(self._jrdd.union(other._jrdd), self.ctx,
                      self._jrdd_deserializer)
        else:
            # These RDDs contain data in different serialized formats, so we
            # must normalize them to the default serializer.
            self_copy = self._reserialize()
            other_copy = other._reserialize()
            rdd = RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx,
                      self.ctx.serializer)
        if (self.partitioner == other.partitioner and
                self.getNumPartitions() == rdd.getNumPartitions()):
            rdd.partitioner = self.partitioner
        return rdd
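A small usage sketch (again assuming a live SparkContext `sc`); the partitions of the two inputs are simply concatenated:

a = sc.parallelize([1, 2], 2)
b = sc.parallelize([3], 1)
u = a.union(b)
print(u.getNumPartitions())   # 3
print(sorted(u.collect()))    # [1, 2, 3]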
    # This function is just as important: if collect is the foundation of every action,
    # then this is the foundation of every transformation.
    def mapPartitionsWithIndex(self, f, preservesPartitioning=False):
        """
        This function is called by the higher-level transformation operators:
        map, flatMap, mapPartitions, reduceByKey, combineByKey and so on.
        PipelinedRDD is the second RDD type in PySpark; every transformation
        returns an RDD of this type.
        """
        return PipelinedRDD(self, f, preservesPartitioning)
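A quick usage sketch (assuming `sc`) showing that the returned object really is a PipelinedRDD:

rdd = sc.parallelize(range(6), 2)
tagged = rdd.mapPartitionsWithIndex(lambda idx, it: ((idx, x) for x in it))
print(type(tagged).__name__)   # PipelinedRDD
print(tagged.collect())        # [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]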
    def mapPartitions(self, f, preservesPartitioning=False):
        """
        As you can see, this is also implemented via mapPartitionsWithIndex.
        The func defined here is the key piece: it wraps the user's function, and
        inside PipelinedRDD these wrappers get nested into one another.
        """
        def func(s, iterator):
            return f(iterator)
        return self.mapPartitionsWithIndex(func, preservesPartitioning)

    def flatMap(self, f, preservesPartitioning=False):
        """
        Very similar to the previous function.
        """
        def func(s, iterator):
            return chain.from_iterable(map(f, iterator))
        return self.mapPartitionsWithIndex(func, preservesPartitioning)
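What flatMap's wrapper does per partition can be seen with plain Python: apply f to every element, then flatten the resulting iterables:

from itertools import chain

f = lambda x: [x, x * 10]
iterator = iter([1, 2, 3])
print(list(chain.from_iterable(map(f, iterator))))   # [1, 10, 2, 20, 3, 30]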
    def join(self, other, numPartitions=None):
        """
        join is implemented by calling python_join, which lives in PySpark's join.py.
        The implementation in join.py is covered in another part; in short, it is
        built on union and groupByKey.
        """
        return python_join(self, other, numPartitions)
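The following is a simplified sketch of that union + groupByKey strategy, not the verbatim join.py code (`sc` is an assumed SparkContext): each side is tagged, the tagged RDDs are unioned and grouped by key, and a dispatch function builds the output pairs:

left = sc.parallelize([('a', 1), ('b', 2)]).mapValues(lambda v: (1, v))
right = sc.parallelize([('a', 'x'), ('a', 'y')]).mapValues(lambda v: (2, v))

def dispatch(seq):
    vbuf, wbuf = [], []
    for n, v in seq:
        (vbuf if n == 1 else wbuf).append(v)
    return [(v, w) for v in vbuf for w in wbuf]

joined = left.union(right).groupByKey().flatMapValues(dispatch)
print(joined.collect())   # roughly [('a', (1, 'x')), ('a', (1, 'y'))]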
    def reduceByKey(self, func, numPartitions=None, partitionFunc=portable_hash):
        """
        Implemented on top of combineByKey.
        """
        return self.combineByKey(lambda x: x, func, func, numPartitions, partitionFunc)
    def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
                     numPartitions=None, partitionFunc=portable_hash):
        """
        The logic of this function is:
        1. Use mapPartitions to combine the values of equal keys within each partition.
        2. Use partitionBy to repartition, so that equal keys end up in the same partition.
        3. Run another merge pass, like step 1, over the shuffled data.
        """
        if numPartitions is None:
            numPartitions = self._defaultReducePartitions()
        serializer = self.ctx.serializer
        memory = self._memory_limit()
        agg = Aggregator(createCombiner, mergeValue, mergeCombiners)

        def combineLocally(iterator):
            merger = ExternalMerger(agg, memory * 0.9, serializer)
            merger.mergeValues(iterator)
            return merger.items()

        locally_combined = self.mapPartitions(combineLocally, preservesPartitioning=True)
        shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)

        def _mergeCombiners(iterator):
            merger = ExternalMerger(agg, memory, serializer)
            merger.mergeCombiners(iterator)
            return merger.items()

        return shuffled.mapPartitions(_mergeCombiners, preservesPartitioning=True)
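A usage sketch (assuming `sc`): the classic per-key average built on combineByKey:

pairs = sc.parallelize([('a', 1), ('a', 3), ('b', 5)])
sums = pairs.combineByKey(lambda v: (v, 1),                          # createCombiner
                          lambda acc, v: (acc[0] + v, acc[1] + 1),   # mergeValue
                          lambda a, b: (a[0] + b[0], a[1] + b[1]))   # mergeCombiners
print(sorted(sums.mapValues(lambda t: t[0] / t[1]).collect()))       # [('a', 2.0), ('b', 5.0)]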
2. PipelinedRDD
class PipelinedRDD(RDD):
    """
    This is the RDD type that every transformation returns; it subclasses RDD.
    It overrides the _jrdd attribute: the jrdd it returns is a PythonRDD whose
    parent RDD is the _jrdd of the RDD that was created first.
    In other words, when user code written with PySpark executes, the JVM-side
    execution always starts from a PythonRDD.
    """
    def __init__(self, prev, func, preservesPartitioning=False):
        if not isinstance(prev, PipelinedRDD) or not prev._is_pipelinable():
            # If the previous RDD is not a (pipelinable) PipelinedRDD,
            # pass the original RDD's _jrdd straight through.
            self.func = func
            self.preservesPartitioning = preservesPartitioning
            self._prev_jrdd = prev._jrdd
            self._prev_jrdd_deserializer = prev._jrdd_deserializer
        else:
            prev_func = prev.func

            # This function nests the previous RDD's logic inside the current one:
            # prev_func is the function given at the previous transformation,
            # func is the function given at this transformation.
            def pipeline_func(split, iterator):
                return func(split, prev_func(split, iterator))

            self.func = pipeline_func
            self.preservesPartitioning = prev.preservesPartitioning and preservesPartitioning
            # If the previous RDD is a PipelinedRDD, keep passing along the _jrdd
            # that came from the very first RDD.
            self._prev_jrdd = prev._prev_jrdd
            self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer
        self.is_cached = False
        self.is_checkpointed = False
        self.ctx = prev.ctx
        self.prev = prev
        self._jrdd_val = None
        self._id = None
        self._jrdd_deserializer = self.ctx.serializer
        self._bypass_serializer = False
        self.partitioner = prev.partitioner if self.preservesPartitioning else None
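A minimal pure-Python sketch of the nesting that pipeline_func performs; the two stage functions are illustrative stand-ins for what map() and filter() would register:

def map_stage(split, iterator):          # e.g. produced by .map(lambda x: x + 1)
    return (x + 1 for x in iterator)

def filter_stage(split, iterator):       # e.g. produced by .filter(lambda x: x % 2 == 0)
    return (x for x in iterator if x % 2 == 0)

def pipeline_func(split, iterator):      # what the second PipelinedRDD stores as self.func
    return filter_stage(split, map_stage(split, iterator))

print(list(pipeline_func(0, iter(range(5)))))   # [2, 4]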
    def getNumPartitions(self):
        return self._prev_jrdd.partitions().size()
    @property
    def _jrdd(self):
        """
        This is where the PythonRDD gets constructed.
        """
        if self._jrdd_val:
            return self._jrdd_val
        if self._bypass_serializer:
            self._jrdd_deserializer = NoOpSerializer()
        if self.ctx.profiler_collector:
            profiler = self.ctx.profiler_collector.new_profiler(self.ctx)
        else:
            profiler = None
        # Serialize the user's Python code.
        wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
                                      self._jrdd_deserializer, profiler)
        # Build a new _jrdd of type PythonRDD; its parent RDD is the _jrdd of the
        # original data source.
        # When an action is later invoked on this RDD, the _jrdd that gets passed in
        # is exactly what is returned here.
        python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
                                             self.preservesPartitioning)
        self._jrdd_val = python_rdd.asJavaRDD()
        if profiler:
            self._id = self._jrdd_val.id()
            self.ctx.profiler_collector.add_profiler(self._id, profiler)
        return self._jrdd_val
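A small sketch (assuming `sc`) of the consequence: chained transformations keep pointing at the original data source's _jrdd, so only one PythonRDD is built when an action finally runs:

base = sc.parallelize(range(4))
chained = base.map(lambda x: x + 1).filter(lambda x: x > 1)
print(chained._prev_jrdd.id() == base._jrdd.id())   # True: still the data source's _jrdd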
    def id(self):
        if self._id is None:
            self._id = self._jrdd.id()
        return self._id

    def _is_pipelinable(self):
        return not (self.is_cached or self.is_checkpointed)