大数据(7i)比较Python和Scala的collection常用方法
Posted 小基基o_O
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了大数据(7i)比较Python和Scala的collection常用方法相关的知识,希望对你有一定的参考价值。
常用方法
// A List holding 1 through 8 (the end bound of Range(1, 9) is exclusive).
val a = Range(1,9).toList
// Length: 8
a.length
// Membership test: false, because 9 is not in the list
a.contains(9)
// Join the elements into a string: "1-2-3-4-5-6-7-8"
a.mkString("-")
// Iterator over the elements
a.iterator
// Reversed copy: List(8, 7, 6, 5, 4, 3, 2, 1)
a.reverse
// Zip with itself: List((1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8))
a.zip(a)
// Sliding window of size 3 and step 2:
// List(List(1, 2, 3), List(3, 4, 5), List(5, 6, 7), List(7, 8))
// toList materializes the window iterator strictly, so every window is visible when printed.
a.sliding(3,2).toList
// Sorted copy (ascending)
a.sorted
// Slice of indices [2, 5): List(3, 4, 5)
a.slice(2, 5)
// Aggregates: prints (36,40320,8,1).
// The tuple is written explicitly instead of relying on argument auto-tupling,
// which the compiler reports under -Xlint:adapted-args.
println((a.sum, a.product, a.max, a.min))
过滤(filter)
# A list mixing integers and strings.
a = [1, 'a', 2, 'b']
# Keep only the elements that are instances of int; the result is wrapped in list() below before printing.
b = filter(lambda x: isinstance(x, int), a)
print(list(b)) # [1, 2]
// A list mixing Int and String elements.
val a = List(1,"a",2,"b")
// Keep only the Int elements; the predicate decides with a type pattern.
val b = a.filter {
  case _: Int => true
  case _      => false
}
println(b) // List(1, 2)
映射(map)
# Sentences to tokenize.
a = ['cat dog', 'cat cow', 'cat dog cow']
# Split every sentence on whitespace; note that `a` is rebound to the map result.
a = map(lambda i: i.split(), a) # or: [i.split() for i in a]
print(list(a)) # [['cat', 'dog'], ['cat', 'cow'], ['cat', 'dog', 'cow']]
// Sentences to tokenize.
val a = List("cat dog","cat cow","cat dog cow")
// split takes a regular expression; " +" matches one or more spaces.
val b = a.map(_.split(" +").toList)
// println (rather than print) ends the output with a newline, like the other snippets.
println(b) // List(List(cat, dog), List(cat, cow), List(cat, dog, cow))
扁平化(flatten)
from numpy import array
# A 2x3 integer array.
a = array([[1, 2, 3], [4, 5, 6]])
# Default flatten: rows are laid out one after another.
print(a.flatten()) # [1 2 3 4 5 6]
# 'F' flattens column by column instead.
print(a.flatten('F')) # [1 4 2 5 3 6]
// A sequence of two inner sequences.
val a = Seq(Seq(1,2,3),Seq(4,5,6))
// flatten concatenates the inner sequences into one.
println(a.flatten) // List(1, 2, 3, 4, 5, 6)
映射后扁平化(flatMap)
# Sentences to tokenize.
a = ['cat dog', 'cat cow', 'cat dog cow']
# Nested comprehension: split each sentence and collect all words into one flat list.
b = [w for t in a for w in t.split()]
print(b) # ['cat', 'dog', 'cat', 'cow', 'cat', 'dog', 'cow']
// Sentences to tokenize.
val a = List("cat dog","cat cow","cat dog cow")
// Split each sentence on runs of spaces and concatenate the resulting word lists.
val b = a.flatMap(_.split(" +").toList)
println(b) // List(cat, dog, cat, cow, cat, dog, cow)
分组(group)
# Group the integers 0..8 by their remainder modulo 3.
dt = dict()
for i in range(9):
    # setdefault creates the empty bucket the first time a remainder is seen.
    dt.setdefault(i % 3, []).append(i)
print(dt)
# {0: [0, 3, 6], 1: [1, 4, 7], 2: [2, 5, 8]}
// The integers 0 through 8 (the end bound of Range(0, 9) is exclusive).
val r = Range(0,9)
// Group the integers by their remainder modulo 3.
println(r.groupBy(_%3))
// HashMap(0 -> Vector(0, 3, 6), 1 -> Vector(1, 4, 7), 2 -> Vector(2, 5, 8))
归约(reduce)
from functools import reduce
a = [4, 3, 2, 1]
# Combine the elements from left to right: ((4 - 3) - 2) - 1
b = reduce(lambda x, y: x - y, a)
print(b) # 4-3-2-1=-2
val a = List(4,3,2,1)
// Subtraction is not associative, so the evaluation order matters.
// reduceLeft guarantees left-to-right evaluation, whereas plain reduce is
// specified for associative operators only. The placeholder syntax also avoids
// lambda parameters that shadow the outer `a` and `b`.
val b = a.reduceLeft(_ - _)
println(b) // ((4-3)-2)-1 = -2
折叠(fold)
def fold(head):
    """Return a reducer that folds an iterable from the left, starting at ``head``.

    The two-step call ``fold(head)(func, iterator)`` mirrors the shape of
    Scala's ``fold(initial)(op)``.

    :param head: initial accumulator value.
    :return: a function ``reduce(func, iterator)`` that applies
        ``func(accumulator, element)`` to every element in order and returns
        the final accumulator (``head`` itself when the iterable is empty).
    """
    def reduce(func, iterator):
        i0 = head
        for i1 in iterator:
            i0 = func(i0, i1)
        return i0
    return reduce


a = [4, 3, 2, 1]
b = fold(10)(lambda x, y: x - y, a)
print(b)  # 0=10-4-3-2-1
Scala 的 fold / foldLeft 使用了柯里化(多个参数列表):先传入初始值,再传入二元函数。
val a = List(4,3,2,1)
// The first parameter list takes the initial value, the second the binary operator.
// foldLeft guarantees left-to-right evaluation; plain fold is specified for
// associative operators only, and subtraction is not associative.
val b = a.foldLeft(10)(_ - _)
// println (rather than print) ends the output with a newline.
println(b) // 0 = 10-4-3-2-1
两个Map合并
合并,后者 value 覆盖前者
import scala.collection.mutable
// Two mutable maps that share the key "c".
val m1 = mutable.Map("c"->3,"b"->2)
val m2 = mutable.Map("c"->5,"d"->4)
// Add every entry of m2 to m1 in place; for the shared key "c" the value from m2 wins.
m1 ++= m2
println(m1) // Map(b -> 2, d -> 4, c -> 5) -- the printed entry order may differ
合并,value 相加
import scala.collection.mutable
// Two mutable maps that share the key "c".
val m1 = mutable.Map("c" -> 3, "b" -> 2)
val m2 = mutable.Map("c" -> 5, "d" -> 4)
// Alternative (disabled): update m1 one entry at a time.
//m2.foreach(t => m1.update(t._1, m1.getOrElse(t._1, 0) + t._2))
// For each entry of m2, add m1's current value for that key (0 when the key is absent),
// then write the summed entries back into m1.
m1 ++= m2.map(t => t._1 -> (t._2 + m1.getOrElse(t._1, 0)))
println(m1) // Map(b -> 2, d -> 4, c -> 8) -- the printed entry order may differ
词频统计
from collections import Counter
# Sentences whose words are counted.
a = ['cat dog', 'cat cow', 'cat dog cow']
# Feed every word of every sentence into the Counter.
c = Counter(w for t in a for w in t.split())
# most_common() lists (word, count) pairs from the highest count to the lowest.
print(c.most_common()) # [('cat', 3), ('dog', 2), ('cow', 2)]
// Step 1: the input sentences.
val a1 = List("cat dog", "cat cow", "cat dog cow")
// List(cat dog, cat cow, cat dog cow)

// Step 2: split every sentence on runs of spaces.
val a2 = a1.map(sentence => sentence.split(" +").toList)
// List(List(cat, dog), List(cat, cow), List(cat, dog, cow))

// Step 3: concatenate the per-sentence word lists.
val a3 = a2.flatten
// List(cat, dog, cat, cow, cat, dog, cow)

// Step 4: group equal words together.
val a4 = a3.groupBy(identity)
// HashMap(cow -> List(cow, cow), dog -> List(dog, dog), cat -> List(cat, cat, cat))

// Step 5: replace each group by its size.
val a5 = a4.map { case (word, occurrences) => (word, occurrences.length) }
// HashMap(cow -> 2, dog -> 2, cat -> 3)

// Step 6: sort ascending by count, then reverse to get the highest count first.
val a6 = a5.toList.sortBy { case (_, count) => count }.reverse
// List((cat,3), (dog,2), (cow,2))
带权重词频统计
# (text, weight) pairs: every word of a text contributes that text's weight.
a = [("cat dog", 3), ("cat pig", 2), ("pig dog", 1)]
c = dict()
for t, f in a:
    for w in t.split():
        # Start from 0 the first time a word is seen.
        c[w] = c.get(w, 0) + f
# Highest total weight first.
print(sorted(c.items(), key=lambda p: p[1], reverse=True))
# [('cat', 5), ('dog', 4), ('pig', 3)]
import scala.collection.mutable
// (text, weight) pairs: every word of a text contributes that text's weight.
val a = List(("cat dog", 3), ("cat pig", 2), ("pig dog", 1))
val counter: mutable.Map[String, Int] = mutable.Map()
for ((text, count) <- a) {
  // " +" splits on runs of spaces, so repeated spaces do not yield empty words
  // (the same pattern the other snippets use).
  for (word <- text.split(" +")) {
    // Start from 0 the first time a word is seen.
    counter(word) = counter.getOrElse(word, 0) + count
  }
}
// Sort ascending by total weight, then reverse to list the heaviest word first.
println(counter.toList.sortBy(_._2).reverse) // List((cat,5), (dog,4), (pig,3))
以上是关于大数据(7i)比较Python和Scala的collection常用方法的主要内容,如果未能解决你的问题,请参考以下文章