Question #1: Parsing Logs
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import split,col
spark = SparkSession.builder.getOrCreate()
rfile=spark.read.format('text').option('delimiter',' ').load("/databricks-datasets/learning-spark/data-001/fake_logs/log1.log")
df=rfile.withColumn("col1", split(col("value"), '"').getItem(0))\
.withColumn("col2", split(col("value"), '"').getItem(1))\
.withColumn("col3", split(col("value"), '"').getItem(2))\
.withColumn("col3_1", split(col("col3"), ' ').getItem(1))\
.withColumn("col3_2", split(col("col3"), ' ').getItem(2))\
.withColumn("col4", split(col("value"), '"').getItem(2))\
.withColumn("col5", split(col("value"), '"').getItem(3))\
.withColumn("col5", split(col("value"), '"').getItem(4))\
.withColumn("browser", split(col("value"), '"').getItem(5))\
.withColumn("col8", split(col("value"), '"').getItem(6))\
.withColumn("col9", split(col("value"), '"').getItem(7))\
.withColumn("col10", split(col("value"),'"').getItem(8))\
.withColumn("col11", split(col("value"), '"').getItem(9))\
.withColumn("col12", split(col("value"), '"').getItem(10))
df.createOrReplaceTempView('log_table')
spark.sql("select browser,col3_1,count(col3_1) from log_table group by browser,col3_1").show()
No comments:
Post a Comment