Thursday, October 1, 2020

RDD join operations

# Fixed: the import line had a stray leading space, which raises an
# IndentationError when this file runs as a plain Python script.
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists (e.g. on Databricks),
# otherwise create one with default settings.
spark = SparkSession.builder.getOrCreate()

# SparkContext handle for the low-level RDD API used below.
sc = spark.sparkContext


def processToRDD(line):
    """Split one pipe-delimited record into its list of fields.

    Args:
        line: A single raw text line (e.g. from a TPC-H data file).

    Returns:
        list[str]: The '|'-separated fields, in order. A trailing
        delimiter yields an empty final field.
    """
    return line.split('|')

  

# Load the TPC-H nation and region tables as RDDs of field lists.
# Assumed schemas (standard TPC-H — confirm against the dataset):
#   nation: nationkey|name|regionkey|comment
#   region: regionkey|name|comment
rd_nation = sc.textFile("/databricks-datasets/tpch/data-001/nation/").map(processToRDD)
rd_region = sc.textFile("/databricks-datasets/tpch/data-001/region/").map(processToRDD)

# Fixed: RDD.join requires pair RDDs of (key, value) tuples, but the
# original joined raw field lists directly, which is not a valid join
# input. Key nation rows by their regionkey (field index 2) and region
# rows by regionkey (field index 0) so the join matches on the shared
# region key.
nation_by_regionkey = rd_nation.map(lambda f: (f[2], f))
region_by_regionkey = rd_region.map(lambda f: (f[0], f))

# (regionkey, (nation_fields, region_fields)) pairs, materialized on
# the driver — safe here because nation/region are tiny tables.
join_nation_region = nation_by_regionkey.join(region_by_regionkey)
join_nation_region.collect()

No comments:

Post a Comment