%%time
from pyspark.sql import SparkSession # Novo no PySpark 2.0
from pyspark.sql.types import StructType\
, StructField\
, IntegerType\
, StringType\
, DoubleType\
, DateType\
, TimestampType\
, BooleanType # Avaliar e inferir schemas
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Explicit input schema as (column name, data type) pairs; every column is
# nullable. Supplying a schema up front avoids Spark's costly type inference.
# NOTE(review): attributed_time is read as DateType while click_time is a
# TimestampType — confirm this asymmetry is intentional for the source CSV.
_columns = [
    ('ip', IntegerType()),
    ('app', IntegerType()),
    ('device', IntegerType()),
    ('os', IntegerType()),
    ('channel', IntegerType()),
    ('click_time', TimestampType()),
    ('attributed_time', DateType()),
    ('is_attributed', IntegerType()),
]
df_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _columns]
)
from pyspark.sql.functions import year, month, dayofmonth

# Create (or reuse) the SparkSession explicitly. The bare name `spark` is only
# pre-defined inside the pyspark shell / some notebook kernels; without this
# line the read below fails with NameError in a plain Python process.
spark = SparkSession.builder.getOrCreate()

# Read the data with the predefined schema — skipping schema inference gives a
# very large speed improvement.
df = spark.read.csv('./datasets/train_sample.csv', header=True, schema=df_schema)

# Derive calendar features from the click timestamp.
df = df.withColumn("click_time_date", df["click_time"].cast(DateType()))
df = df.withColumn("click_time_year", year(df['click_time_date']))
df = df.withColumn("click_time_month", month(df['click_time_date']))
df = df.withColumn("click_time_dayofmonth", dayofmonth(df['click_time_date']))
# Count non-attributed clicks (is_attributed == 0) per day of month and pull
# the small aggregated result to the driver for plotting.
daily_counts = (
    df.filter("is_attributed == 0")
      .groupBy('click_time_dayofmonth')
      .count()
      .collect()
)
x = [str(row['click_time_dayofmonth']) for row in daily_counts]
y = [row['count'] for row in daily_counts]

# matplotlib.pyplot is already imported as `plt` at the top of the file;
# the redundant re-import was removed.
plt.bar(x, y)
plt.show()
# NOTE(review): the line below was a stray scraped-blog footer (Chinese text
# about a JSON.parse error, unrelated to this script). Kept as a comment so
# the file remains valid Python; safe to delete.
# "The above covers 'JSON.parse: unexpected end of data at line 1 column 1 of
#  the JSON data'. If it did not solve your problem, see the following articles."