Basic RDD operations in PySpark

Ways to create an RDD in PySpark
  • Loading an external dataset.
       rdd = sc.textFile("abc.txt")
       rdd.count()
       rdd.take(2)
  • Parallelizing a collection (see the quick check just after this list).
        lines = sc.parallelize(["pandas", "i like pandas"])
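
A quick check on the parallelized RDD (a minimal sketch, run in the pyspark shell with the lines RDD above):

lines.count()    # 2 -- number of elements
lines.first()    # 'pandas' -- first element
lines.take(2)    # ['pandas', 'i like pandas']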

map
rdd1 = rdd.map(lambda x: x.upper())   # convert each line of the text-file RDD to upper case

for i in rdd1.collect():
    print(i)


x = sc.parallelize(["b", "a", "c"])
y = x.map(lambda z: (z, 1))
print(x.collect())
print(y.collect())
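# Expected output (parallelized elements keep their order on collect):
# ['b', 'a', 'c']
# [('b', 1), ('a', 1), ('c', 1)]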

filter
x = sc.parallelize([1, 2, 3])
y = x.filter(lambda n: n % 2 == 1)  # keep odd values
print(x.collect())
print(y.collect())
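# Expected output:
# [1, 2, 3]
# [1, 3]

map and filter can also be chained; a minimal sketch on the same RDD (the names z and n here are illustrative, not from the original post):

z = x.map(lambda n: n * 10).filter(lambda n: n > 10)
print(z.collect())
# [20, 30]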

reduce
# reduce numbers 1 to 10 by adding them up
x = sc.parallelize([1,2,3,4,5,6,7,8,9,10], 2)
cSum = x.reduce(lambda accum, n: accum + n)
print(cSum)
# 55

# reduce numbers 1 to 10 by multiplying them
cMul = x.reduce(lambda accum, n: accum * n)
print(cMul)
# 3628800

# or by defining a named reduce function instead of a lambda
def cumulativeSum(accum, n):
    return accum + n

cSum = x.reduce(cumulativeSum)
print(cSum)
# 55
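
reduce works with any commutative and associative two-argument function; as a further sketch on the same RDD, the maximum can be found the same way:

cMax = x.reduce(lambda accum, n: accum if accum > n else n)
print(cMax)
# 10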

group by
## Basic groupByKey example in Python
x = sc.parallelize([
    ("USA", 1), ("USA", 2), ("India", 1),
    ("UK", 1), ("India", 4), ("India", 9),
    ("USA", 8), ("USA", 3), ("India", 4),
    ("UK", 6), ("UK", 9), ("UK", 5)], 3)

## groupByKey with default partitions
y = x.groupByKey()

## Check partitions
print('Output:', y.getNumPartitions())
## Output: 3

## With predefined Partitions
y = x.groupByKey(2)
print('Output:', y.getNumPartitions())
## Output: 2

## Print Output
for t in y.collect():
    print(t[0], [v for v in t[1]])

## USA [1, 2, 8, 3]
## India [1, 4, 9, 4]
## UK [1, 6, 9, 5]
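
The grouped values can then be aggregated in a follow-up step, for example by summing each key's values with a plain map over the grouped RDD y (a minimal sketch; the exact key order in the output may vary):

sums = y.map(lambda kv: (kv[0], sum(kv[1])))
print(sums.collect())
## [('USA', 14), ('India', 18), ('UK', 21)]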

Key/value pairs can also be built directly with map, for example keying each element of the lines RDD from above by its first word:

pairs = lines.map(lambda x: (x.split(" ")[0], x))
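
Collecting the result shows each element keyed by its first word:

print(pairs.collect())
# [('pandas', 'pandas'), ('i', 'i like pandas')]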
