from pyspark.sql.functions import *
df=spark.read.format("json").option("multiline","true").load("/dhiru/input/data2.json")
df.select(explode("data1").alias("data1")).select("data1.created_time","data1.id","data1.from").show()
+--------------------+--------------------+--------------------+
| created_time| id| from|
+--------------------+--------------------+--------------------+
|2013-11-11T04:04:...|10084673031_10152...|[Computers/techno...|
+--------------------+--------------------+--------------------+
==> Key point: when you find an array in the schema, use the explode function on that column.
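As a minimal illustration of what explode does (toy data, not taken from data2.json), each element of an array column becomes its own row:
>>> toy = spark.createDataFrame([("a", [1, 2, 3])], ["key", "values"])
>>> toy.select("key", explode("values").alias("value")).show()
+---+-----+
|key|value|
+---+-----+
|  a|    1|
|  a|    2|
|  a|    3|
+---+-----+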
df.select(explode("data1").alias("data1")).select("data1.created_time","data1.id","data1.description","data1.from").show()
>>> df.select(explode("data1").alias("data1")).select("data1.caption","data1.from.id",explode("data1.from.category_list").alias("category_list")).show()
+------------------+-------------+--------------------+
| data1.caption|data1.from.id| category_list|
+------------------+-------------+--------------------+
|newsroom.cisco.com| 10084673031|[139823692748565,...|
|newsroom.cisco.com| 10084673031|[177721448951559,...|
+------------------+-------------+--------------------+
>>> df.select(explode("data1").alias("data1")).select("data1.*",explode("data1.from.category_list").alias("category_list")).show()
+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------------+----+--------------------+--------------------+
| caption| created_time| description| from| icon| id| likes| link| message| name| picture|privacy| status_type|type| updated_time| category_list|
+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------------+----+--------------------+--------------------+
|newsroom.cisco.com|2013-11-11T04:04:...|Andrew Marsh, Fir...|[Computers/techno...|http://static.ak....|10084673031_10152...|[[[10000468910329...|http://newsroom.c...|The Network team ...|Online Program Ma...|http://external.a...| []|shared_story|link|2013-11-11T04:06:...|[139823692748565,...|
|newsroom.cisco.com|2013-11-11T04:04:...|Andrew Marsh, Fir...|[Computers/techno...|http://static.ak....|10084673031_10152...|[[[10000468910329...|http://newsroom.c...|The Network team ...|Online Program Ma...|http://external.a...| []|shared_story|link|2013-11-11T04:06:...|[177721448951559,...|
+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------------+----+--------------------+--------------------+
df.select(explode("data1").alias("data1")).select("data1.caption","data1.from.id",explode("data1.from.category_list").alias("category_list")).select(col("*"),col("category_list.*")).show()
+------------------+-------------+--------------------+---------------+--------------------+
| data1.caption|data1.from.id| category_list| id| name|
+------------------+-------------+--------------------+---------------+--------------------+
|newsroom.cisco.com| 10084673031|[139823692748565,...|139823692748565|Computers & Elect...|
|newsroom.cisco.com| 10084673031|[177721448951559,...|177721448951559| Workplace & Office|
+------------------+-------------+--------------------+---------------+--------------------+
df.select(explode("data1").alias("data1")).\
select(col("data1.caption").alias("caption"),\
col("data1.type").alias("type"),\
col("data1.id").alias("id"),\
col("data1.from.id").alias("fromid"),\
explode("data1.from.category_list").alias("category_list")).show()
+------------------+----+--------------------+-----------+--------------------+
| caption|type| id| fromid| category_list|
+------------------+----+--------------------+-----------+--------------------+
|newsroom.cisco.com|link|10084673031_10152...|10084673031|[139823692748565,...|
|newsroom.cisco.com|link|10084673031_10152...|10084673031|[177721448951559,...|
+------------------+----+--------------------+-----------+--------------------+
Store the flattened result in df2 so it can be reused:
>>> df2=df.select(explode("data1").alias("data1")).\
... select(col("data1.caption").alias("caption"),\
... col("data1.type").alias("type"),\
... col("data1.id").alias("id"),\
... col("data1.from.id").alias("fromid"),\
... explode("data1.from.category_list").alias("category_list"))
>>> df2.printSchema()
root
|-- caption: string (nullable = true)
|-- type: string (nullable = true)
|-- id: string (nullable = true)
|-- fromid: string (nullable = true)
|-- category_list: struct (nullable = true)
| |-- id: string (nullable = true)
| |-- name: string (nullable = true)
>>> df2.select("caption","type","id","fromid","category_list.id","category_list.name").show()
+------------------+----+--------------------+-----------+---------------+--------------------+
| caption|type| id| fromid| id| name|
+------------------+----+--------------------+-----------+---------------+--------------------+
|newsroom.cisco.com|link|10084673031_10152...|10084673031|139823692748565|Computers & Elect...|
|newsroom.cisco.com|link|10084673031_10152...|10084673031|177721448951559| Workplace & Office|
+------------------+----+--------------------+-----------+---------------+--------------------+
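Note that the top-level id and category_list.id both come out as columns named id. Aliasing the nested fields avoids ambiguous references later on (a sketch using the same col()/alias() API as above; category_id and category_name are just illustrative names):
>>> df2.select("caption","type","id","fromid",
...            col("category_list.id").alias("category_id"),
...            col("category_list.name").alias("category_name")).show()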
>>> df2.select("caption","type","id","fromid","category_list.id","category_list.name").where(df2.category_list.id=="139823692748565").show()
+------------------+----+--------------------+-----------+---------------+--------------------+
| caption|type| id| fromid| id| name|
+------------------+----+--------------------+-----------+---------------+--------------------+
|newsroom.cisco.com|link|10084673031_10152...|10084673031|139823692748565|Computers & Elect...|
+------------------+----+--------------------+-----------+---------------+--------------------+
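The same filter can also be written with col(), which reads a little more clearly for nested paths (a sketch; it should return the same single row):
>>> df2.where(col("category_list.id") == "139823692748565")\
...     .select("caption","type","id","fromid","category_list.name").show()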