Spark Solution By Dheerendra Yadav: PySpark with function argument, groupBy, limit

Thursday, March 19, 2020

PySpark with function argument, groupBy, limit

Use Case -1

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from unicodedata import category

class spark():
    """Report helper that counts patient rows per category/state/city/date.

    NOTE(review): the lowercase class name is kept because callers
    instantiate ``spark()``; inside the methods the Spark session is held in
    a differently named local so the two are never confused.
    """

    def schema1(self):
        """Return the explicit schema used when reading the patient CSV.

        Returns:
            StructType: six nullable fields -- ``name``, ``city``, ``state``
            and ``category`` as strings, ``age`` as integer, ``st_date`` as
            date.
        """
        schema1 = StructType([
            StructField('name', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('city', StringType(), True),
            StructField('state', StringType(), True),
            StructField('category', StringType(), True),
            StructField('st_date', DateType(), True),
        ])
        return schema1

    def fun(self, cat, path=r'D:\Documents\data\patient_data.csv'):
        """Show grouped row counts for a single category.

        Reads the CSV at ``path`` with the schema from ``schema1()``, groups
        by category, state, city and st_date, orders the groups by descending
        count, prints ``cat`` and shows the groups whose category equals it.

        Args:
            cat: Category value to keep.
            path: CSV file to read. Defaults to the location the original
                script used; written as a raw string so the backslashes are
                not treated as escape sequences.
        """
        session = SparkSession.builder.getOrCreate()
        rfile = session.read.format('csv').load(path, schema=self.schema1())
        data1 = (
            rfile.groupBy('category', 'state', 'city', 'st_date')
            .count()
            .withColumnRenamed('count', 'cnt')
            .orderBy(desc('cnt'))
        )
        # To filter by date instead, compare col('st_date') with the wanted date.
        print("{}".format(cat))
        # A column expression (not a formatted SQL string) keeps the filter
        # correct even when the value contains a quote character.
        data1.where(col('category') == cat).show()

# Run use case 1: print and show the grouped counts for category 'A'.
obj=spark()
obj.fun(cat='A')
# NOTE: fun() takes no `date1` parameter, so the call below stays disabled.
#obj.fun(date1='2020-03-18')

##Use Case -2 function SUM, COUNT, GroupBy, OrderBy in single pipeline

# Schema for the sample file: a category label and an integer value.
cat_sch = StructType([
    StructField("cate", StringType(), True),
    StructField("num", IntegerType(), True),
])

# A bare `spark` name is not a session here (in this file it is bound to the
# `spark` class), so obtain the active session explicitly before reading.
session = SparkSession.builder.getOrCreate()
rfile = session.read.format('csv').load('D:/Documents/data/sample.csv', schema=cat_sch)

# Per category: row count and sum of `num`, largest groups first.
# `count`, `sum` and `desc` are the pyspark.sql.functions versions brought in
# by the star import (this `sum` is not the Python builtin).
df2 = (
    rfile.groupBy('cate')
    .agg(count('cate').alias('cnt'), sum('num').alias('cat_sum'))
    .orderBy(desc('cnt'))
)

df2.show()