Case: We have a JSON file and need to count how many hashtag ('#') characters appear in each row of the existing column (comments).
[{
"id": 1,
"first_name": "Jeanette",
"last_name": "Penddreth",
"email": "jpenddreth0@census@.gov",
"comments": "#This is sample jason file# which used for to count # tag",
"ip_address": "26.58.193.2"
}, {
"id": 2,
"first_name": "Giavani",
"last_name": "Frediani",
"email": "gfrediani1senate.gov",
"comments": "Did we find any # tag",
"ip_address": "229.179.4.212"
}]
Solution 1:
# Solution 1 uses only built-in column functions: splitting on '#' produces one
# more piece than there are '#' characters, so the count is size(...) - 1.
from pyspark.sql import SparkSession
from pyspark.sql.functions import size, split

# getOrCreate() hands back the active session when one already exists.
spark = SparkSession.builder.getOrCreate()

# Raw string so the backslashes in the Windows path are never treated as
# escape sequences (non-raw '\D', '\d', '\s' are invalid escapes).
df = (
    spark.read.format('json')
    .option('multiline', 'true')
    .load(r'D:\Documents\data\sample.json')
)
df.withColumn('cnt', size(split(df.comments, "#")) - 1).show()
Result: (output not shown)
Solution 2: Using a Python UDF that loops over the characters
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, split,udf
# Session handle shared by the UDF script below.
spark = SparkSession.builder.getOrCreate()
@udf(returnType='int')
def chr(str):
    """Return the number of '#' characters in the given string.

    Registered as a Spark UDF with an integer return type.

    Args:
        str: The text to scan, or ``None`` for a null column value.

    Returns:
        The count of '#' characters, or ``None`` when the input is ``None``
        (previously iterating ``None`` raised ``TypeError``).

    NOTE: the names ``chr`` and ``str`` shadow Python builtins; they are kept
    so the existing ``chr(df.comments)`` call keeps working.
    """
    if str is None:
        return None
    # Walk the text character by character and tally the '#' matches.
    return sum(1 for ch in str if ch == '#')
def chrcnt():
    """Load the sample JSON file and display it with a '#'-count column.

    Reads the multi-line JSON file, adds a ``cnt`` column by applying the
    ``chr`` UDF to the ``comments`` column, and prints the resulting table.
    """
    # Raw string: the backslashes in the Windows path must stay literal.
    df = (
        spark.read.format('json')
        .option('multiline', 'true')
        .load(r'D:\Documents\data\sample.json')
    )
    res = df.withColumn('cnt', chr(df.comments))
    # show() prints the table itself and returns None, so wrapping it in
    # print() only emitted a stray "None" line.
    res.show()


chrcnt()
Solution 3: Using Python's built-in str.count() method inside a UDF
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, split,udf
# Session handle shared by the UDF script below.
spark = SparkSession.builder.getOrCreate()
@udf(returnType='int')
def chr(str):
    """Return the number of '#' characters in the given string.

    Registered as a Spark UDF with an integer return type; the counting is
    delegated to the string's own ``count`` method.

    Args:
        str: The text to scan, or ``None`` for a null column value.

    Returns:
        The count of '#' characters, or ``None`` when the input is ``None``
        (previously ``None.count`` raised ``AttributeError``).

    NOTE: the names ``chr`` and ``str`` shadow Python builtins; they are kept
    so the existing ``chr(df.comments)`` call keeps working.
    """
    if str is None:
        return None
    return str.count('#')
def chrcnt():
    """Load the sample JSON file and display it with a '#'-count column.

    Reads the multi-line JSON file, adds a ``cnt`` column by applying the
    ``chr`` UDF to the ``comments`` column, and prints the resulting table.
    """
    # Raw string: the backslashes in the Windows path must stay literal.
    df = (
        spark.read.format('json')
        .option('multiline', 'true')
        .load(r'D:\Documents\data\sample.json')
    )
    res = df.withColumn('cnt', chr(df.comments))
    # show() prints the table itself and returns None, so wrapping it in
    # print() only emitted a stray "None" line.
    res.show()


chrcnt()
No comments:
Post a Comment