Ways to create an RDD in PySpark

There are two main ways to create an RDD:
- Loading an external dataset, for example with sc.textFile().
- Parallelizing an existing collection in the driver program with sc.parallelize().
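All of the snippets below assume an existing SparkContext named sc. In the pyspark shell one is created for you; in a standalone script it can be set up roughly like this (the app name and master URL are placeholder values, not part of the original post):

from pyspark import SparkContext

# Placeholder master/app name; in the pyspark shell, sc already exists.
sc = SparkContext(master="local[*]", appName="rdd-examples")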
# 1. Loading an external dataset (the file name here is just a placeholder)
rdd = sc.textFile("data.txt")
print(rdd.count())   # number of lines in the file
print(rdd.take(2))   # first two lines

# map() builds a new RDD; collect() returns its elements to the driver
rdd1 = rdd.map(lambda x: x.upper())
for i in rdd1.collect():
    print(i)
x = sc.parallelize(["b", "a", "c"])
y = x.map(lambda z: (z, 1))
print(x.collect())
print(y.collect())
x = sc.parallelize([1, 2, 3])
y = x.filter(lambda n: n % 2 == 1)   # keep odd values
print(x.collect())   # [1, 2, 3]
print(y.collect())   # [1, 3]
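Note that transformations such as map() and filter() are lazy: Spark only runs them when an action such as collect() or count() is called. A small sketch of chaining the two (the variable names here are just for illustration):

nums = sc.parallelize(range(1, 11))
odd_squares = nums.filter(lambda n: n % 2 == 1).map(lambda n: n * n)   # nothing computed yet
print(odd_squares.collect())   # the action triggers the job: [1, 9, 25, 49, 81]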
Reduce
# reduce numbers 1 to 10 by adding them up
x = sc.parallelize([1,2,3,4,5,6,7,8,9,10], 2)
cSum = x.reduce(lambda accum, n: accum + n)
print(cSum)
# 55
# reduce numbers 1 to 10 by multiplying them
cMul = x.reduce(lambda accum, n: accum * n)
print(cMul)
# 3628800
# the same sum, using a named function instead of a lambda
def cumulativeSum(accum, n):
    return accum + n

cSum = x.reduce(cumulativeSum)
print(cSum)
# 55
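The second argument to parallelize() above (the 2) sets the number of partitions, and reduce() combines partial results from each partition. A quick way to inspect the split is glom(), which collects each partition into a list; the exact split shown below is just what local mode typically produces:

print(x.getNumPartitions())   # 2
print(x.glom().collect())     # e.g. [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]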
Group by
## Basic groupByKey example in Python
x = sc.parallelize([
    ("USA", 1), ("USA", 2), ("India", 1),
    ("UK", 1), ("India", 4), ("India", 9),
    ("USA", 8), ("USA", 3), ("India", 4),
    ("UK", 6), ("UK", 9), ("UK", 5)], 3)
## groupByKey with the default number of partitions
y = x.groupByKey()
## Check partitions
print('Output:', y.getNumPartitions())
## Output: 3
## With an explicit number of partitions
y = x.groupByKey(2)
print('Output:', y.getNumPartitions())
## Output: 2
## Print the output
for t in y.collect():
    print(t[0], [v for v in t[1]])
## USA [1, 2, 8, 3]
## India [1, 4, 9, 4]
## UK [1, 6, 9, 5]
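groupByKey() shuffles every value for a key across the network. When only a per-key aggregate is needed, reduceByKey() is usually the better choice because it combines values within each partition before the shuffle. A sketch on the same pair RDD (output order may differ):

sums = x.reduceByKey(lambda a, b: a + b)
print(sums.collect())
## e.g. [('USA', 14), ('India', 18), ('UK', 21)]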
Pair RDDs

A pair RDD is an RDD of (key, value) tuples. A common way to build one is to key each line of a text RDD by its first word:
pairs = lines.map(lambda x: (x.split(" ")[0], x))
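A minimal runnable sketch of the same idea, using sc.parallelize() so it does not depend on an input file (the sample sentences are made up for illustration):

lines = sc.parallelize(["hello spark", "hello world", "goodbye spark"])
pairs = lines.map(lambda x: (x.split(" ")[0], x))
print(pairs.collect())
# [('hello', 'hello spark'), ('hello', 'hello world'), ('goodbye', 'goodbye spark')]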