from pyspark.sql import SparkSession
# Obtain a SparkSession via the builder (getOrCreate returns the active
# session when one already exists instead of starting a second one).
spark = SparkSession.builder.getOrCreate()
# SparkContext of that session; used below for the RDD API (sc.textFile).
sc = spark.sparkContext
def processToRDD(line):
    """Split one pipe-delimited text record into its list of string fields.

    Args:
        line: A single line of text whose fields are separated by ``|``.

    Returns:
        The list of field strings in their original order. No trimming or
        type conversion is applied; a line ending in ``|`` therefore yields
        a trailing empty string, and an empty line yields ``['']``.
    """
    return line.split('|')
# Load the NATION and REGION tables as RDDs of field lists (one list of
# strings per input line, produced by processToRDD).
rd_nation = sc.textFile("/databricks-datasets/tpch/data-001/nation/").map(processToRDD)
rd_region = sc.textFile("/databricks-datasets/tpch/data-001/region/").map(processToRDD)

# RDD.join only works on (key, value) pairs: it reads element [0] as the key
# and element [1] as the value. Joining the raw field lists would therefore
# match nation[0] against region[0] and silently drop every other column.
# Key both sides explicitly on the region key and keep the whole record as
# the value.
# Column positions follow the TPC-H schema (NATION: nationkey, name,
# regionkey, comment; REGION: regionkey, name, comment) -- verify against
# the files if the dataset layout differs.
nation_by_regionkey = rd_nation.keyBy(lambda fields: fields[2])
region_by_regionkey = rd_region.keyBy(lambda fields: fields[0])

# Each joined element has the shape (regionkey, (nation_fields, region_fields)).
join_nation_region = nation_by_regionkey.join(region_by_regionkey)
join_nation_region.collect()
# (Web-page footer text captured with the script, not part of the program:
# "No comments:" / "Post a Comment")