Sunday, September 20, 2020

PySpark: find a special character in JSON

Case: We have a JSON file and need to count how many hashtag characters ('#') appear in an existing column (comments).

[{

  "id": 1,

  "first_name": "Jeanette",

  "last_name": "Penddreth",

  "email": "jpenddreth0@census@.gov",

  "comments": "#This is sample jason file# which used for to count # tag",

  "ip_address": "26.58.193.2"

}, {

  "id": 2,

  "first_name": "Giavani",

  "last_name": "Frediani",

  "email": "gfrediani1senate.gov",

  "comments": "Did we find any # tag",

  "ip_address": "229.179.4.212"

}]

Solution 1:

# Assumes `spark` (a SparkSession) and `size`/`split` from
# pyspark.sql.functions are already in scope, e.g. in a PySpark shell.
# 'multiline' is needed because the file is a pretty-printed JSON array,
# not one record per line. The raw string keeps the Windows backslashes
# literal instead of relying on invalid escape sequences such as '\D'.
df = spark.read.format('json').option('multiline', 'true').load(r'D:\Documents\data\sample.json')

# Splitting on '#' yields one more piece than there are '#' characters,
# so the count is the number of pieces minus one.
df.withColumn('cnt', size(split(df.comments, "#")) - 1).show()


Result: the new cnt column contains 3 for id 1 and 1 for id 2.


Solution 2:

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, split,udf

# Reuse the active SparkSession if there is one; otherwise create it.
spark=SparkSession.builder.getOrCreate()

@udf(returnType='int')
def chr(str):
    """Count the '#' characters in one column value by scanning it.

    The function name and its parameter shadow the built-ins ``chr`` and
    ``str``; both are kept so existing calls keep working.

    Args:
        str: The cell value to scan, or ``None`` when the column is null.

    Returns:
        The number of '#' characters, or ``None`` (a null ``cnt``) when
        the input is null.
    """
    # Null cells reach a Python UDF as None; iterating None would raise
    # TypeError and fail the whole job, so propagate the null instead.
    if str is None:
        return None
    return sum(1 for ch in str if ch == '#')

def chrcnt():
    """Load the sample JSON file and display it with a '#' count column.

    Reads the file through the module-level ``spark`` session, adds a
    ``cnt`` column computed by the ``chr`` UDF from ``comments``, and
    prints the resulting table.
    """
    # 'multiline' is needed because the file is a pretty-printed JSON
    # array. The raw string keeps the Windows backslashes literal instead
    # of relying on invalid escape sequences such as '\D'.
    df = spark.read.format('json').option('multiline', 'true').load(r'D:\Documents\data\sample.json')
    res = df.withColumn('cnt', chr(df.comments))
    # show() prints the table itself and returns None, so wrapping it in
    # print() would only add a stray "None" line to the output.
    res.show()


chrcnt()

Solution 3: Using Python's built-in str.count() method

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, split,udf

# Reuse the active SparkSession if there is one; otherwise create it.
spark=SparkSession.builder.getOrCreate()

@udf(returnType='int')
def chr(str):
    """Count the '#' characters in one column value with ``str.count``.

    The function name and its parameter shadow the built-ins ``chr`` and
    ``str``; both are kept so existing calls keep working.

    Args:
        str: The cell value to scan, or ``None`` when the column is null.

    Returns:
        The number of '#' characters, or ``None`` (a null ``cnt``) when
        the input is null.
    """
    # Null cells reach a Python UDF as None, which has no count() method;
    # propagate the null instead of failing the job with AttributeError.
    if str is None:
        return None
    return str.count('#')

def chrcnt():
    """Load the sample JSON file and display it with a '#' count column.

    Reads the file through the module-level ``spark`` session, adds a
    ``cnt`` column computed by the ``chr`` UDF from ``comments``, and
    prints the resulting table.
    """
    # 'multiline' is needed because the file is a pretty-printed JSON
    # array. The raw string keeps the Windows backslashes literal instead
    # of relying on invalid escape sequences such as '\D'.
    df = spark.read.format('json').option('multiline', 'true').load(r'D:\Documents\data\sample.json')
    res = df.withColumn('cnt', chr(df.comments))
    # show() prints the table itself and returns None, so wrapping it in
    # print() would only add a stray "None" line to the output.
    res.show()


chrcnt()

No comments:

Post a Comment