Thursday, October 1, 2020

Parsing a log file with a Spark DataFrame

Question #1: Parsing Logs
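The fake log files under /databricks-datasets/learning-spark appear to follow the Apache combined log format; an illustrative line (my own example, not copied from the dataset) looks like:

66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET /index.html HTTP/1.1" 200 514 "-" "Mozilla/5.0 (compatible; Googlebot/2.1)"

Splitting such a line on the double-quote character isolates the request line, the referrer, and the user-agent, which is the idea behind the code below.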


from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col

spark = SparkSession.builder.getOrCreate()

# The 'text' source reads each line into a single string column named "value";
# it does not take a delimiter option, so all splitting happens afterwards.
rfile = spark.read.format('text').load("/databricks-datasets/learning-spark/data-001/fake_logs/log1.log")

# Split each raw line on the double quote: even items are the unquoted parts
# of the line, odd items are the quoted parts (request line, referrer, agent).
df = (rfile
      .withColumn("col1", split(col("value"), '"').getItem(0))     # host, identity, user, [timestamp]
      .withColumn("col2", split(col("value"), '"').getItem(1))     # request line, e.g. GET /path HTTP/1.1
      .withColumn("col3", split(col("value"), '"').getItem(2))     # e.g. " 200 514 ": status and size
      .withColumn("col3_1", split(col("col3"), ' ').getItem(1))    # HTTP status code
      .withColumn("col3_2", split(col("col3"), ' ').getItem(2))    # response size in bytes
      .withColumn("col4", split(col("value"), '"').getItem(3))     # referrer
      .withColumn("col5", split(col("value"), '"').getItem(4))     # separator between quoted fields
      .withColumn("browser", split(col("value"), '"').getItem(5))  # user-agent string
      # Items 6-10 are null for a standard combined-log line.
      .withColumn("col8", split(col("value"), '"').getItem(6))
      .withColumn("col9", split(col("value"), '"').getItem(7))
      .withColumn("col10", split(col("value"), '"').getItem(8))
      .withColumn("col11", split(col("value"), '"').getItem(9))
      .withColumn("col12", split(col("value"), '"').getItem(10)))


df.createOrReplaceTempView('log_table')

# Count requests per user-agent and HTTP status code.
spark.sql("select browser, col3_1, count(col3_1) as cnt from log_table group by browser, col3_1").show()
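The same aggregation can be written directly against the DataFrame, without registering a temp view:

# Equivalent DataFrame-API aggregation: requests per user-agent and status code.
df.groupBy("browser", "col3_1").count().show()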
