Thursday, January 2, 2020

Parsing XML, Avro, JSON, and Parquet files in Spark

Parsing different file formats — Avro, Parquet, JSON, and XML — with Spark.

=====================xml parsing====================
Note: XML support requires the spark-xml jar (spark-xml-0.5.0.jar) on the classpath.
sample file:
<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications
      with XML.</description>
   </book>
   <book id="bk102">
      <author>Ralls, Kim</author>
      <title>Midnight Rain</title>
      <genre>Fantasy</genre>
      <price>5.95</price>

>>> df = spark.read.format("com.databricks.spark.xml").options(rootTag='catalog').options(rowTag='book').load("D:/Software/coding/data/sample.xml")
>>> df.show()
+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|  _id|              author|         description|          genre|price|publish_date|               title|
+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|bk101|Gambardella, Matthew|An in-depth look ...|       Computer|44.95|  2000-10-01|XML Developer's G...|
|bk102|          Ralls, Kim|A former architec...|        Fantasy| 5.95|  2000-12-16|       Midnight Rain|


#############.py file

from pyspark.sql import SparkSession
class cls:
    """Read an XML file with the spark-xml package and show two columns."""

    def meth(self):
        """Load sample.xml (one row per <book> element) and display the
        _id and genre columns.

        Requires the spark-xml jar (spark-xml-0.5.0.jar) on the classpath.
        """
        # BUG FIX: the original wrapped the *definition* of meth in
        # try/except at class-body level.  That handler runs only while
        # the class body is being executed, so an IOError raised when
        # meth() is actually called was never caught.  The handler now
        # wraps the runtime calls instead.
        try:
            spark = SparkSession.builder.getOrCreate()
            df = (spark.read.format("com.databricks.spark.xml")
                       .options(rootTag='catalog')
                       .options(rowTag='book')
                       .load("D:/Software/coding/data/sample.xml"))
            df.select("_id", "genre").show()
            print('Heeeelo')
        except IOError as e:
            # NOTE(review): Spark read failures typically surface as
            # AnalysisException rather than IOError -- confirm IOError
            # is the intended exception type here.
            print('This is error --', e)
           
           
# Script entry point: run the XML-parsing demo.
if __name__ == '__main__':
    cls().meth()

==============================parquet parsing================

>>>df = spark.read.parquet("/Software/coding/data/userdata1.parquet")
>>> df.show()
|  registration_dttm| id|first_name|last_name|               email|gender|     ip_address| 
+-------------------+---+----------+---------+--------------------+------+---------------+--
+-------------------+---+----------+---------+--------------------+------+---------------+--
|2016-02-03 13:25:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|    1.197.201.2| 
|2016-02-03 22:34:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male| 218.111.175.34| 

########.py

from pyspark.sql import SparkSession
class cls:
    """Read a Parquet file and write selected columns out in CSV format."""

    def meth(self):
        """Read userdata1.parquet, project three columns, and save them
        as CSV under E:/Software/coding/data/writefile/data1.txt.
        """
        # BUG FIX: the try/except originally wrapped the def statement,
        # so it executed at class-definition time and could never catch
        # an IOError raised when meth() runs.  Moved inside the method.
        try:
            spark = SparkSession.builder.getOrCreate()
            df = spark.read.parquet("D:/Software/coding/data/userdata1.parquet")
            df2 = df.select("id", "first_name", "country")
            # Spark treats the .save() target as an output *directory*;
            # despite the .txt suffix, the part files inside are CSV.
            df2.write.format('csv').save('E:/Software/coding/data/writefile/data1.txt')
            # df.show()
            print('Heeeelo')
        except IOError as e:
            print('This is error --', e)
           
           
# Script entry point: run the Parquet-parsing demo.
if __name__ == '__main__':
    cls().meth()


===================================json  parsing============================

from pyspark.sql.functions import *
# Read a multi-line JSON document; each record holds a nested "data1" array.
df=spark.read.format("json").option("multiline","true").load("/dhiru/input/data2.json")

# Flatten the nested structure step by step:
#   1. explode("data1")                  -> one row per element of the data1 array
#   2. pull scalar fields (caption, type, from.id) out of each element
#   3. explode("data1.from.category_list") -> one row per category of each element
#   4. pull cat_id / cat_name out of each category struct
# Finally keep only the flattened columns and filter to a single category id.
# NOTE: the two explodes produce a row per (data1 element, category) pair,
# so statement order matters here.
df.withColumn("data1",explode("data1"))\
  .withColumn("caption",col("data1.caption"))\
  .withColumn("type",col("data1.type"))\
  .withColumn("fromid",col("data1.from.id"))\
  .withColumn("cat_list",explode("data1.from.category_list"))\
  .withColumn("cat_id",col("cat_list.id"))\
  .withColumn("cat_name",col("cat_list.name"))\
  .select("caption","type","fromid","cat_id","cat_name")\
  .where(col("cat_id")=="177721448951559")\
  .show()


Output:-------------

+------------------+----+-----------+---------------+------------------+       
|           caption|type|     fromid|         cat_id|          cat_name|
+------------------+----+-----------+---------------+------------------+
|newsroom.cisco.com|link|10084673031|177721448951559|Workplace & Office|
+------------------+----+-----------+---------------+------------------+

>>> df=spark.read.format("json").option("multiline","true").load("/dhiru/input/zipcode.json")

>>>from pyspark.sql.functions import *
>>>df=spark.read.format("json").option("multiline","true").load("/dhiru/input/data2.json")
>>>df.select(explode("data1").alias("data1")).select("data1.created_time","data1.id","data1.from").show()
+--------------------+--------------------+--------------------+
|        created_time|                  id|                from|
+--------------------+--------------------+--------------------+
|2013-11-11T04:04:...|10084673031_10152...|[Computers/techno...|
+--------------------+--------------------+--------------------+


==========================avro parsing=====================================
Note: I am using Spark 2.4.4, so I am using the Avro jar spark-avro_2.11-2.4.0.jar.

################.py file
from pyspark.sql import SparkSession

class avroparse:
    """Read an Avro file via the spark-avro package and display it."""

    def avroparsing(self):
        """Load twitter.avro and print its contents to stdout."""
        session = SparkSession.builder.getOrCreate()
        frame = (session.read
                        .format("com.databricks.spark.avro")
                        .load("D:/Software/coding/data/twitter.avro"))
        frame.show()
       
   
# Script entry point: run the Avro-parsing demo.
if __name__ == '__main__':
    avroparse().avroparsing()

################Output----

+----------+--------------------+----------+
|  username|               tweet| timestamp|
+----------+--------------------+----------+
|    miguno|Rock: Nerf paper,...|1366150681|
|BlizzardCS|Works as intended...|1366154481|
+----------+--------------------+----------+

##############on pyspark

>>> readfile=spark.read.format("com.databricks.spark.avro").load("D:/Software/coding/data/twitter.avro")
>>> readfile.show()
+----------+--------------------+----------+
|  username|               tweet| timestamp|
+----------+--------------------+----------+
|    miguno|Rock: Nerf paper,...|1366150681|
|BlizzardCS|Works as intended...|1366154481|
+----------+--------------------+----------+









No comments:

Post a Comment