##Use Case -1: GroupBy, Count, and OrderBy with a category filter
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

class PatientStats:
    def schema1(self):
        # Explicit schema for patient_data.csv: name, age, city, state, category, start date
        schema1 = StructType([StructField('name', StringType(), True),
                              StructField('age', IntegerType(), True),
                              StructField('city', StringType(), True),
                              StructField('state', StringType(), True),
                              StructField('category', StringType(), True),
                              StructField('st_date', DateType(), True)])
        return schema1

    def fun(self, cat):
        spark = SparkSession.builder.getOrCreate()
        # Forward slashes sidestep backslash-escape problems in Windows paths
        rfile = spark.read.format('csv').load('D:/Documents/data/patient_data.csv', schema=self.schema1())
        # Count rows per (category, state, city, st_date), then sort by the count, descending
        data1 = (rfile.groupBy('category', 'state', 'city', 'st_date')
                      .count()
                      .withColumnRenamed('count', 'cnt')
                      .orderBy(desc('cnt')))
        # data1.where("st_date='{}'".format(date1)).show()
        print("{}".format(cat))
        data1.where("category='{}'".format(cat)).show()

obj = PatientStats()
obj.fun(cat='A')
# obj.fun(date1='2020-03-18')
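The two commented-out date1 lines above only run if fun() also accepts a date parameter. A minimal sketch of such an extension (the optional-parameter signature is my assumption, not part of the original code) could replace fun() in the class above:

from pyspark.sql.functions import col

    def fun(self, cat=None, date1=None):  # hypothetical signature: both filters optional
        spark = SparkSession.builder.getOrCreate()
        rfile = spark.read.format('csv').load('D:/Documents/data/patient_data.csv', schema=self.schema1())
        data1 = (rfile.groupBy('category', 'state', 'city', 'st_date')
                      .count()
                      .withColumnRenamed('count', 'cnt')
                      .orderBy(desc('cnt')))
        if cat is not None:
            # Column expression instead of an embedded SQL string: no quoting issues
            data1.where(col('category') == cat).show()
        if date1 is not None:
            # Spark coerces the '2020-03-18' string to DateType for the comparison
            data1.where(col('st_date') == date1).show()

With this version, both obj.fun(cat='A') and obj.fun(date1='2020-03-18') run as written.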
##Use Case -2: SUM, COUNT, GroupBy, and OrderBy in a single pipeline
from pyspark.sql.functions import count, sum, desc

spark = SparkSession.builder.getOrCreate()
# Two-column schema for sample.csv: a category label and a number
cat_sch = StructType([StructField("cate", StringType(), True),
                      StructField("num", IntegerType(), True)])
rfile = spark.read.format('csv').load('D:/Documents/data/sample.csv', schema=cat_sch)
# Count rows and sum 'num' per category, sorted by the count, descending
df2 = rfile.groupBy('cate').agg(count('cate').alias('cnt'), sum('num').alias('cat_sum')).orderBy(desc('cnt'))
df2.show()
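The same pipeline can also be expressed in plain SQL (a sketch, assuming the rfile DataFrame and SparkSession from above) by registering a temporary view:

rfile.createOrReplaceTempView('sample')
spark.sql("""
    SELECT cate, COUNT(cate) AS cnt, SUM(num) AS cat_sum
    FROM sample
    GROUP BY cate
    ORDER BY cnt DESC
""").show()

Both forms produce the same result; the DataFrame chain keeps everything in Python, while the SQL form is convenient for readers coming from a database background.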