Step - 1 Validate format of json file.
{
"data1": [{
"id": "10084673031_10152357071093032",
"from": {
"category": "Computers\/technology",
"category_list": [{
"id": "139823692748565",
"name": "Computers & Electronics"
}, {
"id": "177721448951559",
"name": "Workplace & Office"
}],
"name": "Cisco",
"id": "10084673031"
},
"message": "The Network team was lucky enough to speak with First Lieutenant Andrew Marsh about his return and transition to life after Afghanistan. Andrew shares his story on how an online program targeted to veterans matched his technical military skills to those required in a civilian career.",
"picture": "http:\/\/external.ak.fbcdn.net\/safe_image.php?d=AQDPRCLRHhV0HEaE&w=154&h=154&url=http\u00253A\u00252F\u00252Fimg.youtube.com\u00252Fvi\u00252FKNedz6qq45A\u00252F0.jpg",
"link": "http:\/\/newsroom.cisco.com\/video-content?type=webcontent&articleId=1281697",
"name": "Online Program Matches Veterans to Civilian Careers - The Network: Cisco's Technology News Site",
"caption": "newsroom.cisco.com",
"description": "Andrew Marsh, First Lieutenant U.S. Marine Corps : I was sent to Afghanistan in January of 2012 and I was an advisor to an Afghan army cornel. so it was my job to advise him on a daily basis. And look at what he was doing communications wise.",
"icon": "http:\/\/static.ak.fbcdn.net\/rsrc.php\/v2\/yD\/r\/aS8ecmYRys0.gif",
"privacy": {
"value": ""
},
"type": "link",
"status_type": "shared_story",
"created_time": "2013-11-11T04:04:51+0000",
"updated_time": "2013-11-11T04:06:26+0000",
"likes": {
"data": [{
"id": "100004689103296",
"name": "Salvador Lara"
}, {
"id": "1661003721",
"name": "Chad Allison"
}, {
"id": "1347829998",
"name": "Lynn Young Dasher"
}, {
"id": "100000750083815",
"name": "Shannon Vanlandingham"
}, {
"id": "100003220428749",
"name": "Tom Peters"
}, {
"id": "904540186",
"name": "Manuel van der Herder"
}, {
"id": "776497928",
"name": "Luis Chiong"
}, {
"id": "748655637",
"name": "Dave van Asch"
}, {
"id": "100005256342167",
"name": "Os Meyti"
}, {
"id": "100004025366069",
"name": "Denish Malam"
}, {
"id": "1050751758",
"name": "Jens Mueller"
}, {
"id": "506521037",
"name": "Jen Nickens"
}, {
"id": "100000723537849",
"name": "Vanya Ivanova"
}, {
"id": "100000875945917",
"name": "Danien Harper"
}, {
"id": "100000546265440",
"name": "Robert Hnatko"
}, {
"id": "656888964",
"name": "Iman Khayambashi"
}, {
"id": "100000730517023",
"name": "Debayan Deb"
}, {
"id": "1642383715",
"name": "Jonathan Schwartz"
}, {
"id": "100000865663563",
"name": "John Mahoney"
}, {
"id": "100003026212534",
"name": "Rafael Kireyev"
}, {
"id": "1490631811",
"name": "Jackie Salvatierra Estrada"
}, {
"id": "776984395",
"name": "Yuval Ariav"
}, {
"id": "100000592615966",
"name": "Ingemar Petersson"
}, {
"id": "1391145150",
"name": "Qassim Zia"
}, {
"id": "606035502793053",
"name": "Cisco Community"
}]
}
}]
}
Steps 2:- Load file in dataframe
df=spark.read.format("json").option("multiline","true").load("/dhiru/input/data2.json")
Step3:- Check schema of DF.
df.printSchema()
root
|-- data1: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- caption: string (nullable = true)
| | |-- created_time: string (nullable = true)
| | |-- description: string (nullable = true)
| | |-- from: struct (nullable = true)
| | | |-- category: string (nullable = true)
| | | |-- category_list: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- name: string (nullable = true)
| | | |-- id: string (nullable = true)
| | | |-- name: string (nullable = true)
| | |-- icon: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- likes: struct (nullable = true)
| | | |-- data: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- name: string (nullable = true)
| | |-- link: string (nullable = true)
| | |-- message: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- picture: string (nullable = true)
| | |-- privacy: struct (nullable = true)
| | | |-- value: string (nullable = true)
| | |-- status_type: string (nullable = true)
| | |-- type: string (nullable = true)
| | |-- updated_time: string (nullable = true)
Step4: Parse nested json file in dataframe and use explode where array has mentioned.
df.select(explode("data1").alias("data1")).\
select(col("data1.caption").alias("caption"),\
col("data1.type").alias("type"),\
col("data1.id").alias("id"),\
col("data1.from.id").alias("fromid"),\
explode("data1.from.category_list").alias("category_list"))\
.select("caption","type","id","fromid",col("category_list.id").alias("cat_id"),"category_list.name")\
.show()
Output :-
+------------------+----+--------------------+-----------+---------------+--------------------+
| caption|type| id| fromid| cat_id| name|
+------------------+----+--------------------+-----------+---------------+--------------------+
|newsroom.cisco.com|link|10084673031_10152...|10084673031|139823692748565|Computers & Elect...|
|newsroom.cisco.com|link|10084673031_10152...|10084673031|177721448951559| Workplace & Office|
+------------------+----+--------------------+-----------+---------------+--------------------+
{
"data1": [{
"id": "10084673031_10152357071093032",
"from": {
"category": "Computers\/technology",
"category_list": [{
"id": "139823692748565",
"name": "Computers & Electronics"
}, {
"id": "177721448951559",
"name": "Workplace & Office"
}],
"name": "Cisco",
"id": "10084673031"
},
"message": "The Network team was lucky enough to speak with First Lieutenant Andrew Marsh about his return and transition to life after Afghanistan. Andrew shares his story on how an online program targeted to veterans matched his technical military skills to those required in a civilian career.",
"picture": "http:\/\/external.ak.fbcdn.net\/safe_image.php?d=AQDPRCLRHhV0HEaE&w=154&h=154&url=http\u00253A\u00252F\u00252Fimg.youtube.com\u00252Fvi\u00252FKNedz6qq45A\u00252F0.jpg",
"link": "http:\/\/newsroom.cisco.com\/video-content?type=webcontent&articleId=1281697",
"name": "Online Program Matches Veterans to Civilian Careers - The Network: Cisco's Technology News Site",
"caption": "newsroom.cisco.com",
"description": "Andrew Marsh, First Lieutenant U.S. Marine Corps : I was sent to Afghanistan in January of 2012 and I was an advisor to an Afghan army cornel. so it was my job to advise him on a daily basis. And look at what he was doing communications wise.",
"icon": "http:\/\/static.ak.fbcdn.net\/rsrc.php\/v2\/yD\/r\/aS8ecmYRys0.gif",
"privacy": {
"value": ""
},
"type": "link",
"status_type": "shared_story",
"created_time": "2013-11-11T04:04:51+0000",
"updated_time": "2013-11-11T04:06:26+0000",
"likes": {
"data": [{
"id": "100004689103296",
"name": "Salvador Lara"
}, {
"id": "1661003721",
"name": "Chad Allison"
}, {
"id": "1347829998",
"name": "Lynn Young Dasher"
}, {
"id": "100000750083815",
"name": "Shannon Vanlandingham"
}, {
"id": "100003220428749",
"name": "Tom Peters"
}, {
"id": "904540186",
"name": "Manuel van der Herder"
}, {
"id": "776497928",
"name": "Luis Chiong"
}, {
"id": "748655637",
"name": "Dave van Asch"
}, {
"id": "100005256342167",
"name": "Os Meyti"
}, {
"id": "100004025366069",
"name": "Denish Malam"
}, {
"id": "1050751758",
"name": "Jens Mueller"
}, {
"id": "506521037",
"name": "Jen Nickens"
}, {
"id": "100000723537849",
"name": "Vanya Ivanova"
}, {
"id": "100000875945917",
"name": "Danien Harper"
}, {
"id": "100000546265440",
"name": "Robert Hnatko"
}, {
"id": "656888964",
"name": "Iman Khayambashi"
}, {
"id": "100000730517023",
"name": "Debayan Deb"
}, {
"id": "1642383715",
"name": "Jonathan Schwartz"
}, {
"id": "100000865663563",
"name": "John Mahoney"
}, {
"id": "100003026212534",
"name": "Rafael Kireyev"
}, {
"id": "1490631811",
"name": "Jackie Salvatierra Estrada"
}, {
"id": "776984395",
"name": "Yuval Ariav"
}, {
"id": "100000592615966",
"name": "Ingemar Petersson"
}, {
"id": "1391145150",
"name": "Qassim Zia"
}, {
"id": "606035502793053",
"name": "Cisco Community"
}]
}
}]
}
Steps 2:- Load file in dataframe
df=spark.read.format("json").option("multiline","true").load("/dhiru/input/data2.json")
Step3:- Check schema of DF.
df.printSchema()
root
|-- data1: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- caption: string (nullable = true)
| | |-- created_time: string (nullable = true)
| | |-- description: string (nullable = true)
| | |-- from: struct (nullable = true)
| | | |-- category: string (nullable = true)
| | | |-- category_list: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- name: string (nullable = true)
| | | |-- id: string (nullable = true)
| | | |-- name: string (nullable = true)
| | |-- icon: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- likes: struct (nullable = true)
| | | |-- data: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- name: string (nullable = true)
| | |-- link: string (nullable = true)
| | |-- message: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- picture: string (nullable = true)
| | |-- privacy: struct (nullable = true)
| | | |-- value: string (nullable = true)
| | |-- status_type: string (nullable = true)
| | |-- type: string (nullable = true)
| | |-- updated_time: string (nullable = true)
Step4: Parse nested json file in dataframe and use explode where array has mentioned.
df.select(explode("data1").alias("data1")).\
select(col("data1.caption").alias("caption"),\
col("data1.type").alias("type"),\
col("data1.id").alias("id"),\
col("data1.from.id").alias("fromid"),\
explode("data1.from.category_list").alias("category_list"))\
.select("caption","type","id","fromid",col("category_list.id").alias("cat_id"),"category_list.name")\
.show()
Output :-
+------------------+----+--------------------+-----------+---------------+--------------------+
| caption|type| id| fromid| cat_id| name|
+------------------+----+--------------------+-----------+---------------+--------------------+
|newsroom.cisco.com|link|10084673031_10152...|10084673031|139823692748565|Computers & Elect...|
|newsroom.cisco.com|link|10084673031_10152...|10084673031|177721448951559| Workplace & Office|
+------------------+----+--------------------+-----------+---------------+--------------------+
No comments:
Post a Comment