Python Data Analysis - 07: Time Series Processing
Posted by nikecode
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# Basics of time series operations
from datetime import datetime
"""
t = datetime(2016,9,10)
print(t)#2016-09-10 00:00:00
data_list = [
datetime(2016,9,1),
datetime(2016,9,10),
datetime(2017,9,1),
datetime(2017,9,20),
datetime(2017,10,1)
]
print(data_list)
# [datetime.datetime(2016, 9, 1, 0, 0), datetime.datetime(2016, 9, 10, 0, 0), datetime.datetime(2017, 9, 1, 0, 0), datetime.datetime(2017, 9, 20, 0, 0), datetime.datetime(2017, 10, 1, 0, 0)]
s1 = Series(np.random.rand(5),index=data_list)
print(s1)
# 2016-09-01 0.437216
# 2016-09-10 0.002021
# 2017-09-01 0.990085
# 2017-09-20 0.635123
# 2017-10-01 0.504584
# dtype: float64
print(s1.values)#[0.74523743 0.67846232 0.33464572 0.66881491 0.34169192]
print(s1.index)
# DatetimeIndex(['2016-09-01', '2016-09-10', '2017-09-01', '2017-09-20',
#                '2017-10-01'],
#               dtype='datetime64[ns]', freq=None)
print(s1[1])#0.3247607714134729
print(s1[datetime(2016,9,10)])#0.3247607714134729
print(s1['2016-09-10'])#0.3247607714134729
print(s1['20160910'])#0.3247607714134729
print(s1['2016-09'])
# 2016-09-01 0.713300
# 2016-09-10 0.265708
# dtype: float64
print(s1['2016'])
# 2016-09-01 0.139233
# 2016-09-10 0.595806
# dtype: float64
"""
"""
data_list_new = pd.date_range("2016-01-01",periods=100,freq="5H")# generate 100 timestamps, one every 5 hours
print(data_list_new)
# DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 05:00:00',
#                '2016-01-01 10:00:00', '2016-01-01 15:00:00',
#                '2016-01-01 20:00:00', '2016-01-02 01:00:00',
#                '2016-01-02 06:00:00', '2016-01-02 11:00:00',
#                '2016-01-02 16:00:00', '2016-01-02 21:00:00',
#                ...
#                '2016-01-20 14:00:00', '2016-01-20 19:00:00',
#                '2016-01-21 00:00:00', '2016-01-21 05:00:00',
#                '2016-01-21 10:00:00', '2016-01-21 15:00:00'],
#               dtype='datetime64[ns]', freq='5H')
s2 = Series(np.random.rand(100),index=data_list_new)
print(s2)
"""
"""
#-----------------
# Resampling and plotting time series data
#t_range = pd.date_range("2016-01-01","2016-12-31")
#print(t_range)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#                '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
#                '2016-01-09', '2016-01-10',
#                ...
#                '2016-12-22', '2016-12-23', '2016-12-24', '2016-12-25',
#                '2016-12-26', '2016-12-27', '2016-12-28', '2016-12-29',
#                '2016-12-30', '2016-12-31'],
#               dtype='datetime64[ns]', length=366, freq='D')
#s1 = Series(np.random.randn(len(t_range)),index=t_range)
#print(s1)
#print(s1["2016-01"].mean())#0.05316056209771481
# s1_month = s1.resample("M").mean()# downsample: one value per month, the monthly mean
# print(s1_month)
# 2016-01-31 0.175917
# 2016-02-29 -0.018886
# 2016-03-31 -0.131760
# 2016-04-30 -0.134704
# 2016-05-31 0.147767
# 2016-06-30 0.382015
# 2016-07-31 0.163278
# 2016-08-31 -0.079203
# 2016-09-30 0.184607
# 2016-10-31 0.055851
# 2016-11-30 0.284106
# 2016-12-31 -0.030083
# Freq: M, dtype: float64
#print(s1.resample("H").ffill())
# 2016-01-01 00:00:00 -2.031085
# 2016-01-01 01:00:00 -2.031085
# 2016-01-01 02:00:00 -2.031085
# ........
t_range = pd.date_range("2016-01-01","2016-12-31",freq="H")
print(t_range)
# DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
#                '2016-01-01 02:00:00', '2016-01-01 03:00:00',
#                '2016-01-01 04:00:00', '2016-01-01 05:00:00',
#                '2016-01-01 06:00:00', '2016-01-01 07:00:00',
#                '2016-01-01 08:00:00', '2016-01-01 09:00:00',
#                ...
#                '2016-12-30 15:00:00', '2016-12-30 16:00:00',
#                '2016-12-30 17:00:00', '2016-12-30 18:00:00',
#                '2016-12-30 19:00:00', '2016-12-30 20:00:00',
#                '2016-12-30 21:00:00', '2016-12-30 22:00:00',
#                '2016-12-30 23:00:00', '2016-12-31 00:00:00'],
#               dtype='datetime64[ns]', length=8761, freq='H')
stock_df = DataFrame(index=t_range)
print(stock_df.head())
#Empty DataFrame
# Columns: []
# Index: [2016-01-01 00:00:00, 2016-01-01 01:00:00, 2016-01-01 02:00:00, 2016-01-01 03:00:00, 2016-01-01 04:00:00]
stock_df["BABA"] = np.random.randint(80,160,size=len(t_range))
stock_df["TENCENT"] = np.random.randint(30,50,size=len(t_range))
print(stock_df.head())
# BABA TENCENT
# 2016-01-01 00:00:00 147 47
# 2016-01-01 01:00:00 88 40
# 2016-01-01 02:00:00 143 33
# 2016-01-01 03:00:00 132 47
# 2016-01-01 04:00:00 93 44
# stock_df.plot()
import matplotlib.pyplot as plt
# plt.show()
weekly_df = DataFrame()
weekly_df["BABA"] = stock_df["BABA"].resample("W").mean()
weekly_df["TENCENT"] = stock_df["TENCENT"].resample("W").mean()
print(weekly_df.head())
# BABA TENCENT
# 2016-01-03 113.819444 39.597222
# 2016-01-10 122.696429 39.029762
# 2016-01-17 120.458333 38.845238
# 2016-01-24 119.196429 39.690476
# 2016-01-31 118.315476 38.690476
weekly_df.plot()
plt.show()
"""
"""
#------------------------------
# Data binning with pd.cut (Binning)
score_list = np.random.randint(25,100,size=20)
print(score_list)#[41 88 82 66 83 84 77 29 72 97 77 81 80 45 30 74 84 46 95 54]
bins = [0,59,70,80,100]
score_cut = pd.cut(score_list,bins)
print(score_cut)
# [(0, 59], (0, 59], (80, 100], (70, 80], (59, 70], ..., (80, 100], (0, 59], (0, 59], (59, 70], (80, 100]]
# Length: 20
# Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
print(pd.value_counts(score_cut))
# (0, 59] 11
# (80, 100] 4
# (59, 70] 3
# (70, 80] 2
# dtype: int64
df = DataFrame()
df["score"] = score_list
df["student"] = [pd.util.testing.rands(3) for i in range(20)]
df["Categories"] = pd.cut(df["score"],bins,labels=["Low","Ok","Good","Great"])
print(df)
# score student Categories
# 0 71 sCO Good
# 1 40 AgI Low
# 2 61 ubC Ok
# 3 65 P1K Ok
# 4 78 ebd Good
# 5 75 oxG Good
# 6 81 JN0 Great
# 7 35 LpS Low
# 8 53 L7l Low
# 9 60 puw Ok
# 10 27 3KJ Low
# 11 77 2ID Good
# 12 63 D26 Ok
# 13 96 jA7 Great
# 14 46 txB Low
# 15 85 8NF Great
# 16 96 jne Great
# 17 71 xBX Good
# 18 75 3HP Good
# 19 93 Svl Great
"""
#------------------------------------------------
# Grouping data with GroupBy
"""
df = pd.read_csv("city_weather.csv")
# print(df)
# data city temperature wind
# 0 03/01/2016 BJ 8 5
# 1 17/01/2016 BJ 12 2
# 2 31/01/2016 BJ 19 2
# 3 03/02/2016 BJ -3 3
# 4 14/02/2016 BJ 19 2
# 5 13/03/2016 BJ 5 3
# 6 10/03/2016 SH -4 4
# 7 03/04/2016 SH 19 3
# 8 24/04/2016 SH 20 3
# 9 08/05/2016 SH 17 3
# 10 22/05/2016 SH 4 2
# 11 05/06/2016 SH -10 4
# 12 19/06/2016 SH 0 5
# 13 03/07/2016 SH 9 5
# 14 17/07/2016 GZ 10 2
# 15 31/07/2016 GZ -1 5
# 16 14/08/2016 GZ 1 5
# 17 28/08/2016 GZ 25 4
# 18 11/09/2016 SZ 20 1
# 19 25/09/2016 SZ -10 4
g = df.groupby(df["city"])
print(g)
#<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000297D780>
print(g.groups)
# {'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'), 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'), 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'), 'SZ': Int64Index([18, 19], dtype='int64')}
print(g.get_group("BJ"))
# data city temperature wind
# 0 03/01/2016 BJ 8 5
# 1 17/01/2016 BJ 12 2
# 2 31/01/2016 BJ 19 2
# 3 03/02/2016 BJ -3 3
# 4 14/02/2016 BJ 19 2
# 5 13/03/2016 BJ 5 3
df_bj = g.get_group("BJ")
print(df_bj.mean())
# temperature 10.000000
# wind 2.833333
# dtype: float64
print(g.mean())
# temperature wind
# city
# BJ 10.000 2.833333
# GZ 8.750 4.000000
# SH 6.875 3.625000
# SZ 5.000 2.500000
"""
# Data aggregation (Aggregation)
df = pd.read_csv("city_weather.csv")
g = df.groupby("city")
print(g.agg("min"))
# data temperature wind
# city
# BJ 03/01/2016 -3 2
# GZ 14/08/2016 -1 2
# SH 03/04/2016 -10 2
# SZ 11/09/2016 -10 1
def foo(attr):
    # each call receives one column of one group as a Series
    print(type(attr))
    print(attr)
    return np.nan
print(g.agg(foo))
# <class ‘pandas.core.series.Series‘>
# 0 03/01/2016
# 1 17/01/2016
# 2 31/01/2016
# 3 03/02/2016
# 4 14/02/2016
# 5 13/03/2016
# Name: data, dtype: object
# <class ‘pandas.core.series.Series‘>
# 14 17/07/2016
# 15 31/07/2016
# 16 14/08/2016
# 17 28/08/2016
# Name: data, dtype: object
# <class ‘pandas.core.series.Series‘>
# 6 10/03/2016
# 7 03/04/2016
# 8 24/04/2016
# 9 08/05/2016
# 10 22/05/2016
# 11 05/06/2016
# 12 19/06/2016
# 13 03/07/2016
# Name: data, dtype: object
# <class ‘pandas.core.series.Series‘>
# 18 11/09/2016
# 19 25/09/2016
# Name: data, dtype: object
# <class ‘pandas.core.series.Series‘>
# 0 8
# 1 12
# 2 19
# 3 -3
# 4 19
# 5 5
# Name: temperature, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 14 10
# 15 -1
# 16 1
# 17 25
# Name: temperature, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 6 -4
# 7 19
# 8 20
# 9 17
# 10 4
# 11 -10
# 12 0
# 13 9
# Name: temperature, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 18 20
# 19 -10
# Name: temperature, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 0 5
# 1 2
# 2 2
# 3 3
# 4 2
# 5 3
# Name: wind, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 14 2
# 15 5
# 16 5
# 17 4
# Name: wind, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 6 4
# 7 3
# 8 3
# 9 3
# 10 2
# 11 4
# 12 5
# 13 5
# Name: wind, dtype: int64
# <class ‘pandas.core.series.Series‘>
# 18 1
# 19 4
# Name: wind, dtype: int64
# data temperature wind
# city
# BJ NaN NaN NaN
# GZ NaN NaN NaN
# SH NaN NaN NaN
# SZ NaN NaN NaN
def foo(attr):
    # range (max - min) of each column within each group
    return attr.max() - attr.min()
print(g.agg(foo))
# temperature wind
# city
# BJ 22 3
# GZ 26 3
# SH 30 3
# SZ 30 3
g_new = df.groupby(["city","wind"])
print(g_new.groups)
# {('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
#  ('BJ', 3): Int64Index([3, 5], dtype='int64'),
#  ('BJ', 5): Int64Index([0], dtype='int64'),
#  ('GZ', 2): Int64Index([14], dtype='int64'),
#  ('GZ', 4): Int64Index([17], dtype='int64'),
#  ('GZ', 5): Int64Index([15, 16], dtype='int64'),
#  ('SH', 2): Int64Index([10], dtype='int64'),
#  ('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
#  ('SH', 4): Int64Index([6, 11], dtype='int64'),
#  ('SH', 5): Int64Index([12, 13], dtype='int64'),
#  ('SZ', 1): Int64Index([18], dtype='int64'),
#  ('SZ', 4): Int64Index([19], dtype='int64')}
print(g_new.get_group(("BJ",3)))
# data city temperature wind
# 3 03/02/2016 BJ -3 3
# 5 13/03/2016 BJ 5 3
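# Extra sketch (not in the original post): agg() also accepts a dict mapping column
# names to functions, so each column can get its own statistic. The tiny DataFrame
# below is made up so the snippet runs even without city_weather.csv.
demo2 = DataFrame({"city": ["BJ", "BJ", "SH", "SH"],
                   "temperature": [8, 12, -4, 19],
                   "wind": [5, 2, 4, 3]})
per_city = demo2.groupby("city").agg({"temperature": "mean", "wind": "max"})
print(per_city)
#       temperature  wind
# city
# BJ           10.0     5
# SH            7.5     4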