60种特征工程操作:使用自定义聚合函数
Posted 我爱Python数据挖掘
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了60种特征工程操作:使用自定义聚合函数相关的知识,希望对你有一定的参考价值。
agg 是一个聚合函数,使用指定轴上的一个或多个操作进行聚合。通过 agg 函数,可以同时对多列提取特征,非常适合用于特征工程。
本文分享60种特征工程操作:使用自定义聚合函数,喜欢记得收藏、关注、点赞。
完整资料、技术交流,文末获取。
内置的聚合函数
在 Pandas 内部支持了13种聚合函数,可以在分组之后使用:
- mean():分组均值
- sum():分组求和
- size():分组大小(每组的行数,包含缺失值)
- count():分组计数(每组中非缺失值的个数)
- std():分组标准差
- var():分组方差
- sem():分组均值的标准误差
- describe():分组描述性统计
- first():分组第一个元素
- last():分组最后一个元素
- nth():分组第N个元素
- min():分组最小值
- max():分组最大值
案例如下,有多种使用方式可供选择:
# Build a small sample frame: one grouping key and two numeric columns.
df = pd.DataFrame({
    'group': [1, 1, 2, 2],
    'values': [4, 1, 1, 2],
    'values2': [0, 1, 1, 2],
})
# Per-group mean of both value columns.
df.groupby('group').mean()
# Per-group mean and standard deviation of both value columns.
# String aliases are used instead of np.mean / np.std so the result does not
# depend on how a given pandas version treats NumPy callables.
df.groupby('group').agg(['mean', 'std'])
# Different aggregations for each column, given as a {column: [functions]} dict.
df.groupby('group').agg({
    'values': ['mean', 'median'],
    'values2': ['mean', 'std'],
})
自定义聚合函数
如果在Pandas内部的聚合函数不满足要求,也可以自定义聚合函数搭配使用
median
def median(x):
    """Return the median of the values in ``x`` (delegates to ``np.median``)."""
    return np.median(x)
variation_coefficient
def variation_coefficient(x):
    """Return ``np.std(x) / np.mean(x)``, or NaN when the mean is exactly 0."""
    mean = np.mean(x)
    if mean == 0:
        # The ratio is undefined for a zero mean; signal that with NaN.
        return np.nan
    return np.std(x) / mean
variance
def variance(x):
    """Return the variance of ``x`` as computed by ``np.var`` (default ``ddof``)."""
    return np.var(x)
skewness
def skewness(x):
    """Return the skewness of ``x`` as computed by ``pandas.Series.skew``.

    Non-Series input is wrapped in a ``pd.Series`` first.
    """
    series = x if isinstance(x, pd.Series) else pd.Series(x)
    return series.skew()
kurtosis
def kurtosis(x):
    """Return the kurtosis of ``x`` as computed by ``pandas.Series.kurtosis``.

    Non-Series input is wrapped in a ``pd.Series`` first.
    """
    series = x if isinstance(x, pd.Series) else pd.Series(x)
    return series.kurtosis()
standard_deviation
def standard_deviation(x):
    """Return the standard deviation of ``x`` as computed by ``np.std``."""
    return np.std(x)
large_standard_deviation
def large_standard_deviation(x):
    """Return ``np.std(x)`` divided by the value range ``max(x) - min(x)``.

    Returns NaN when the range is 0 (all values equal), where the ratio is
    undefined.
    """
    # Compute the range once instead of once for the check and once for the ratio.
    value_range = np.max(x) - np.min(x)
    if value_range == 0:
        return np.nan
    return np.std(x) / value_range
variation_coefficient
def variation_coefficient(x):
    """Return ``np.std(x) / np.mean(x)``, or NaN when the mean is exactly 0."""
    mean = np.mean(x)
    if mean == 0:
        # The ratio is undefined for a zero mean; signal that with NaN.
        return np.nan
    return np.std(x) / mean
variance_std_ratio
def variance_std_ratio(x):
    """Return ``var / sqrt(var)`` for ``var = np.var(x)``, or NaN when ``var`` is 0."""
    var = np.var(x)
    if var == 0:
        # Avoid 0 / 0.
        return np.nan
    return var / np.sqrt(var)
ratio_beyond_r_sigma
def ratio_beyond_r_sigma(x, r):
    """Return the fraction of values farther than ``r * std(x)`` from the mean.

    Args:
        x: 1-D values (ndarray, Series, list or tuple).
        r: Multiplier applied to ``np.std(x)`` to form the distance threshold.

    Returns:
        The share of elements whose absolute deviation from ``np.mean(x)``
        strictly exceeds the threshold, or NaN for empty input.
    """
    if isinstance(x, (list, tuple)):
        # Plain sequences have no ``.size`` and do not support ``x - mean``.
        x = np.asarray(x)
    if x.size == 0:
        return np.nan
    threshold = r * np.asarray(np.std(x))
    return np.sum(np.abs(x - np.mean(x)) > threshold) / x.size
range_ratio
def range_ratio(x):
    """Return ``|mean(x) - median(x)| / (max(x) - min(x))``.

    Returns NaN when the range is 0 (all values equal).
    """
    value_range = np.max(x) - np.min(x)
    if value_range == 0:
        return np.nan
    return np.abs(np.mean(x) - np.median(x)) / value_range
has_duplicate_max
def has_duplicate_max(x):
    """Return whether the maximum of ``x`` occurs at least twice."""
    return np.sum(x == np.max(x)) >= 2
has_duplicate_min
def has_duplicate_min(x):
    """Return whether the minimum of ``x`` occurs at least twice."""
    return np.sum(x == np.min(x)) >= 2
has_duplicate
def has_duplicate(x):
    """Return whether any value occurs more than once in ``x``."""
    if isinstance(x, (list, tuple)):
        # Plain sequences have no ``.size`` attribute.
        x = np.asarray(x)
    return x.size != np.unique(x).size
count_duplicate_max
def count_duplicate_max(x):
    """Return how many elements of ``x`` equal its maximum."""
    return np.sum(x == np.max(x))
count_duplicate_min
def count_duplicate_min(x):
    """Return how many elements of ``x`` equal its minimum."""
    return np.sum(x == np.min(x))
count_duplicate
def count_duplicate(x):
    """Return the number of elements of ``x`` minus the number of distinct values."""
    if isinstance(x, (list, tuple)):
        # Plain sequences have no ``.size`` attribute.
        x = np.asarray(x)
    return x.size - np.unique(x).size
sum_values
def sum_values(x):
    """Return the sum of ``x``; empty input yields the integer 0."""
    if len(x) == 0:
        return 0
    return np.sum(x)
log_return
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the input.

    The result of ``np.log`` must provide a ``.diff()`` method, so the input
    is expected to be a pandas object (e.g. a ``pd.Series``); the first
    element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()
realized_volatility
def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    return np.sqrt(np.sum(series ** 2))
realized_abs_skew
def realized_abs_skew(series):
    """Return the cube root of the absolute value of the sum of cubed values."""
    cube_sum = np.sum(series ** 3)
    return np.power(np.abs(cube_sum), 1 / 3)
realized_skew
def realized_skew(series):
    """Return the signed cube root of the sum of cubed values of ``series``."""
    # Compute the cube sum once; it is needed for both the sign and the magnitude.
    cube_sum = np.sum(series ** 3)
    return np.sign(cube_sum) * np.power(np.abs(cube_sum), 1 / 3)
realized_vol_skew
def realized_vol_skew(series):
    """Return the sixth root of the absolute value of the sum of sixth powers."""
    sixth_power_sum = np.sum(series ** 6)
    return np.power(np.abs(sixth_power_sum), 1 / 6)
realized_quarticity
def realized_quarticity(series):
    """Return the fourth root of the sum of fourth powers of ``series``."""
    fourth_power_sum = np.sum(series ** 4)
    return np.power(fourth_power_sum, 1 / 4)
count_unique
def count_unique(series):
    """Return the number of distinct values in ``series``."""
    return len(np.unique(series))
count
def count(series):
    """Return the number of elements in ``series`` (its ``.size``)."""
    if isinstance(series, (list, tuple)):
        # Plain sequences have no ``.size`` attribute.
        series = np.asarray(series)
    return series.size
maximum_drawdown
def maximum_drawdown(series):
    """Return the drop from the preceding peak to the largest-drawdown point.

    The drawdown at each position is the running maximum minus the value.
    The position with the largest drawdown is located, and the result is the
    maximum of all values before it minus the value at it.

    Returns:
        0 for inputs shorter than 2; NaN when the largest drawdown sits at
        index 0 (there are no earlier values to take a peak from).
    """
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    # Locate the largest drawdown once and reuse the index.
    trough_idx = np.argmax(np.maximum.accumulate(series) - series)
    if trough_idx == 0:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    peak = np.max(series[:trough_idx])
    return peak - series[trough_idx]
maximum_drawup
def maximum_drawup(series):
    """Return the rise from the preceding low to the largest-drawup point.

    Implemented as the maximum-drawdown computation on the negated series.

    Returns:
        0 for inputs shorter than 2; NaN when the located position is index 0
        (there are no earlier values to compare against).
    """
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    negated = -series
    # Locate the largest drawdown of the negated series once and reuse the index.
    extreme_idx = np.argmax(np.maximum.accumulate(negated) - negated)
    if extreme_idx == 0:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    prior_peak = np.max(negated[:extreme_idx])
    return prior_peak - negated[extreme_idx]
drawdown_duration
def drawdown_duration(series):
    """Return the index distance between the largest drawdown and its peak.

    The largest-drawdown position is where (running maximum - value) is
    greatest; the peak is the first position of the maximum among the values
    before it. Returns 0 for inputs shorter than 2 or when the
    largest-drawdown position is index 0.
    """
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    # The original computed this argmax twice under two names; once is enough.
    trough_idx = np.argmax(np.maximum.accumulate(series) - series)
    if trough_idx == 0:
        return 0
    peak_idx = np.argmax(series[:trough_idx])
    return trough_idx - peak_idx
drawup_duration
def drawup_duration(series):
    """Return the index distance between the largest drawup and its preceding low.

    Implemented as the drawdown-duration computation on the negated series.
    Returns 0 for inputs shorter than 2 or when the located position is
    index 0.
    """
    series = np.asarray(series)
    if len(series) < 2:
        return 0
    negated = -series
    # The original computed this argmax twice under two names; once is enough.
    extreme_idx = np.argmax(np.maximum.accumulate(negated) - negated)
    if extreme_idx == 0:
        return 0
    start_idx = np.argmax(negated[:extreme_idx])
    return extreme_idx - start_idx
max_over_min
def max_over_min(series):
    """Return ``max(series) / min(series)``.

    Returns 0 for inputs shorter than 2 and NaN when the minimum is 0.
    """
    if len(series) < 2:
        return 0
    smallest = np.min(series)
    if smallest == 0:
        # Division by zero is undefined; signal that with NaN.
        return np.nan
    return np.max(series) / smallest
mean_n_absolute_max
def mean_n_absolute_max(x, number_of_maxima=1):
    """Return the arithmetic mean of the n largest absolute values of ``x``.

    Args:
        x: Sequence of numeric values.
        number_of_maxima: How many of the largest absolute values to average;
            must be greater than 0.

    Returns:
        The mean of the ``number_of_maxima`` largest absolute values, or NaN
        when ``len(x)`` is not strictly greater than ``number_of_maxima``.

    Raises:
        AssertionError: If ``number_of_maxima`` is not greater than 0.
    """
    # The placeholder braces were lost in the original message, so it never
    # showed the offending value; it also named the wrong bound.
    assert (
        number_of_maxima > 0
    ), f" number_of_maxima={number_of_maxima} which is not greater than 0"
    if len(x) <= number_of_maxima:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    n_absolute_maximum_values = np.sort(np.absolute(x))[-number_of_maxima:]
    return np.mean(n_absolute_maximum_values)
count_above
def count_above(x, t):
    """Return the fraction of values in ``x`` that are ``>= t``.

    Despite the name, the result is a share in [0, 1], not a count.
    Returns NaN for empty input.
    """
    if isinstance(x, (list, tuple)):
        # A plain sequence cannot be compared element-wise with a scalar.
        x = np.asarray(x)
    if len(x) == 0:
        return np.nan
    return np.sum(x >= t) / len(x)
count_below
def count_below(x, t):
    """Return the fraction of values in ``x`` that are ``<= t``.

    Despite the name, the result is a share in [0, 1], not a count.
    Returns NaN for empty input.
    """
    if isinstance(x, (list, tuple)):
        # A plain sequence cannot be compared element-wise with a scalar.
        x = np.asarray(x)
    if len(x) == 0:
        return np.nan
    return np.sum(x <= t) / len(x)
number_peaks
def number_peaks(x, n):
    """Count positions of ``x`` that exceed their ``n`` neighbours on each side.

    Only the interior ``x[n:-n]`` is considered. For every shift ``1..n`` the
    interior is compared against ``_roll(x, shift)`` and ``_roll(x, -shift)``
    (a helper defined elsewhere; presumably a circular shift, to be confirmed
    against its definition). A position counts only if it is strictly greater
    in every comparison.
    """
    interior = x[n:-n]
    is_peak = None
    for shift in range(1, n + 1):
        greater_than_shifted = interior > _roll(x, shift)[n:-n]
        if is_peak is None:
            is_peak = greater_than_shifted
        else:
            is_peak &= greater_than_shifted
        is_peak &= interior > _roll(x, -shift)[n:-n]
    return np.sum(is_peak)
mean_abs_change
def mean_abs_change(x):
    """Return the mean of the absolute differences between consecutive values."""
    steps = np.diff(x)
    return np.mean(np.abs(steps))
mean_change
def mean_change(x):
    """Return the mean difference between consecutive values of ``x``.

    Computed as ``(last - first) / (len - 1)``, to which the sum of
    consecutive differences telescopes. Returns NaN for fewer than 2 values.
    """
    x = np.asarray(x)
    if len(x) <= 1:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    return (x[-1] - x[0]) / (len(x) - 1)
mean_second_derivative_central
def mean_second_derivative_central(x):
    """Return ``(x[-1] - x[-2] - x[1] + x[0]) / (2 * (len(x) - 2))``.

    Returns NaN for fewer than 3 values.
    """
    x = np.asarray(x)
    if len(x) <= 2:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    return (x[-1] - x[-2] - x[1] + x[0]) / (2 * (len(x) - 2))
root_mean_square
def root_mean_square(x):
    """Return the square root of the mean of squared values; NaN for empty input."""
    if len(x) == 0:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    return np.sqrt(np.mean(np.square(x)))
absolute_sum_of_changes
def absolute_sum_of_changes(x):
    """Return the sum of the absolute differences between consecutive values."""
    steps = np.diff(x)
    return np.sum(np.abs(steps))
longest_strike_below_mean
def longest_strike_below_mean(x):
    """Return the maximum run length reported for the mask ``x < mean(x)``.

    The run lengths come from ``_get_length_sequences_where``, a helper
    defined elsewhere (presumably lengths of consecutive True runs, to be
    confirmed against its definition). Returns 0 for empty input.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    if x.size == 0:
        return 0
    return np.max(_get_length_sequences_where(x < np.mean(x)))
longest_strike_above_mean
def longest_strike_above_mean(x):
    """Return the maximum run length reported for the mask ``x > mean(x)``.

    The run lengths come from ``_get_length_sequences_where``, a helper
    defined elsewhere (presumably lengths of consecutive True runs, to be
    confirmed against its definition). Returns 0 for empty input.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    if x.size == 0:
        return 0
    return np.max(_get_length_sequences_where(x > np.mean(x)))
count_above_mean
def count_above_mean(x):
    """Return the number of values in ``x`` strictly greater than its mean."""
    mean = np.mean(x)
    above_positions = np.where(x > mean)[0]
    return above_positions.size
count_below_mean
def count_below_mean(x):
    """Return the number of values in ``x`` strictly less than its mean."""
    mean = np.mean(x)
    below_positions = np.where(x < mean)[0]
    return below_positions.size
last_location_of_maximum
def last_location_of_maximum(x):
    """Return the relative position of the last occurrence of the maximum.

    Computed as ``1 - argmax(reversed x) / len(x)``, so a maximum in the last
    position gives 1.0. Returns NaN for empty input.
    """
    x = np.asarray(x)
    if len(x) == 0:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    return 1.0 - np.argmax(x[::-1]) / len(x)
first_location_of_maximum
def first_location_of_maximum(x):
    """Return the relative position of the first occurrence of the maximum.

    Computed as ``argmax(x) / len(x)``, so a maximum in the first position
    gives 0.0. Returns NaN for empty input.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    if len(x) == 0:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        return np.nan
    return np.argmax(x) / len(x)
last_location_of_minimum
def last_location_of_minimum(x):
x = np.asarray(x)
return 1.0 - np.argmin(x[::-1])
以上是关于60种特征工程操作:使用自定义聚合函数的主要内容,如果未能解决你的问题,请参考以下文章:
95-910-148-源码-FlinkSQL-Flink SQL自定义聚合函数