17. Frequency Models: The Poisson Distribution


from functools import partial

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_tweedie_deviance
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer

def load_mtpl2(n_samples=100000):
    """Fetch and join the French Motor Third-Party Liability Claims datasets."""
    # freMTPL2freq dataset from https://www.openml.org/d/41214
    df_freq = fetch_openml(data_id=41214, as_frame=True)['data']
    df_freq['IDpol'] = df_freq['IDpol'].astype(int)
    df_freq.set_index('IDpol', inplace=True)

    # freMTPL2sev dataset from https://www.openml.org/d/41215
    df_sev = fetch_openml(data_id=41215, as_frame=True)['data']

    # sum ClaimAmount over identical IDs
    df_sev = df_sev.groupby('IDpol').sum()

    df = df_freq.join(df_sev, how="left")
    df["ClaimAmount"].fillna(0, inplace=True)

    # unquote string fields
    for column_name in df.columns[df.dtypes.values == object]:
        df[column_name] = df[column_name].str.strip("'")
    return df.iloc[:n_samples]

def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,
                  title=None, ax=None, fill_legend=False):
    """Plot observed and predicted target values aggregated per feature level."""
    # aggregate observed and predicted variables by feature level
    df_ = df.loc[:, [feature, weight]].copy()
    df_["observed"] = df[observed] * df[weight]
    df_["predicted"] = predicted * df[weight]
    df_ = (
        df_.groupby([feature])[[weight, "observed", "predicted"]]
        .sum()
        .assign(observed=lambda x: x["observed"] / x[weight])
        .assign(predicted=lambda x: x["predicted"] / x[weight])
    )

    ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
    y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8
    p2 = ax.fill_between(
        df_.index,
        0,
        y_max * df_[weight] / df_[weight].values.max(),
        color="g",
        alpha=0.1,
    )
    if fill_legend:
        ax.legend([p2], ["{} distribution".format(feature)])
    ax.set(
        ylabel=y_label if y_label is not None else None,
        title=title if title is not None else "Train: Observed vs Predicted",
    )

def score_estimator(
    estimator, X_train, X_test, df_train, df_test, target, weights,
    tweedie_powers=None,
):
    """Evaluate an estimator on train and test sets with different metrics."""
    metrics = [
        ("D² explained", None),  # Use default scorer if it exists
        ("mean abs. error", mean_absolute_error),
        ("mean squared error", mean_squared_error),
    ]
    if tweedie_powers:
        metrics += [(
            "mean Tweedie dev p={:.4f}".format(power),
            partial(mean_tweedie_deviance, power=power)
        ) for power in tweedie_powers]

    res = []
    for subset_label, X, df in [
        ("train", X_train, df_train),
        ("test", X_test, df_test),
    ]:
        y, _weights = df[target], df[weights]
        for score_label, metric in metrics:
            if isinstance(estimator, tuple) and len(estimator) == 2:
                # Score the model consisting of the product of frequency and
                # severity models.
                est_freq, est_sev = estimator
                y_pred = est_freq.predict(X) * est_sev.predict(X)
            else:
                y_pred = estimator.predict(X)

            if metric is None:
                if not hasattr(estimator, "score"):
                    continue
                score = estimator.score(X, y, sample_weight=_weights)
            else:
                score = metric(y, y_pred, sample_weight=_weights)

            res.append(
                {"subset": subset_label, "metric": score_label, "score": score}
            )

    res = (
        pd.DataFrame(res)
        .set_index(["metric", "subset"])
        .score.unstack(-1)
        .round(4)
        .loc[:, ['train', 'test']]
    )
    return res

df = load_mtpl2(n_samples=60000)

# Note: filter out claims with zero amount, as the severity model
# requires strictly positive target values.
df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0

# Correct for unreasonable observations (that might be data errors)
# and a few exceptionally large claim amounts
df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
df["Exposure"] = df["Exposure"].clip(upper=1)
df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)

log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log),
    StandardScaler()
)

column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
            ["VehAge", "DrivAge"]),
        ("onehot_categorical", OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("passthrough_numeric", "passthrough",
            ["BonusMalus"]),
        ("log_scaled_numeric", log_scale_transformer,
            ["Density"]),
    ],
    remainder="drop",
)
X = column_trans.fit_transform(df)

# Insurance companies are interested in modeling the Pure Premium, that is
# the expected total claim amount per unit of exposure for each policyholder
# in their portfolio:
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

# This can be indirectly approximated by a 2-step model: the product of the
# frequency times the average claim amount per claim:
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)

with pd.option_context("display.max_columns", 15):
    print(df[df.ClaimAmount > 0].head())

df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

# The parameters of the model are estimated by minimizing the Poisson deviance
# on the training set via a quasi-Newton solver (L-BFGS). Since some of the
# features are collinear, we use a weak penalization to avoid numerical issues.
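# For reference, the sample-weighted mean Poisson deviance being minimized is
#   D(y, mu) = (1 / sum_i w_i) * sum_i w_i * 2 * (y_i * log(y_i / mu_i) - y_i + mu_i),
# with the convention y_i * log(y_i / mu_i) = 0 whenever y_i = 0.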

glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"],
             sample_weight=df_train["Exposure"])
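# A quick sanity check: for an unpenalized Poisson GLM with log link and an
# intercept, the exposure-weighted sum of predicted frequencies matches the
# observed total claim count on the training set exactly; with the small L2
# penalty alpha=1e-3 used here, the two should only be approximately equal.
print("observed claims: ",
      (df_train["Frequency"] * df_train["Exposure"]).sum())
print("predicted claims:",
      (glm_freq.predict(X_train) * df_train["Exposure"]).sum())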

scores = score_estimator(
    glm_freq,
    X_train,
    X_test,
    df_train,
    df_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)

fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8))
fig.subplots_adjust(hspace=0.3, wspace=0.2)

plot_obs_pred(
    df=df_train,
    feature="DrivAge",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_train),
    y_label="Claim Frequency",
    title="train data",
    ax=ax[0, 0],
)
plot_obs_pred(
    df=df_test,
    feature="DrivAge",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_test),
    y_label="Claim Frequency",
    title="test data",
    ax=ax[0, 1],
    fill_legend=True,
)
plot_obs_pred(
    df=df_test,
    feature="VehAge",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_test),
    y_label="Claim Frequency",
    title="test data",
    ax=ax[1, 0],
    fill_legend=True,
)
plot_obs_pred(
    df=df_test,
    feature="BonusMalus",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_test),
    y_label="Claim Frequency",
    title="test data",
    ax=ax[1, 1],
    fill_legend=True,
)

The Poisson Distribution

The Poisson distribution applies to events that occur at random in time or space, where we only care about the number of occurrences. It arises when the following assumptions hold:
* Any one occurrence does not affect the probability of any other occurrence (occurrences are independent).
* The rate of occurrence is constant: the probability of an occurrence in an interval depends only on the interval's length, not on its position.
* As the length of an interval shrinks toward zero, the probability of an occurrence within it tends to zero.

Here λ is the rate of occurrence, i.e., the average number of events per unit time (or per unit area); t is the length of the interval; and X is the number of events within that interval. X is called a Poisson random variable, and the probability distribution of X is the Poisson distribution. If μ denotes the average number of events in an interval of length t, then μ = λ·t. For example, at a rate of λ = 2 events per minute, an interval of t = 3 minutes has mean count μ = 2 × 3 = 6.

Informal definition: suppose an event occurs at random over a period of time and the following conditions hold:
* If the period is divided into arbitrarily small subintervals, the probability of exactly one occurrence in a subinterval is proportional to the subinterval's length as that length approaches zero.
* Within each such vanishingly small subinterval, the probability of two or more occurrences is identically zero.
* Whether the event occurs in one small subinterval is independent of whether it occurs in any other.
Formula:
$$f\left(x\mid\lambda\right)=\dfrac{\lambda^{x}e^{-\lambda}}{x!}$$
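
As a quick check, we can evaluate this PMF by hand for λ = 5 and x = 3 and compare the result against scipy.stats.poisson.pmf:

import math
import scipy.stats as stats

lam, x = 5, 3
manual = lam ** x * math.exp(-lam) / math.factorial(x)
print(manual)                        # ≈ 0.140374
print(stats.poisson.pmf(x, mu=lam))  # same value from scipy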


 

Code Examples for the Poisson Distribution

# import libraries
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

PMF

# PMF of Poisson(mu=5)
x = np.arange(20)
y = stats.poisson.pmf(x, mu=5)

plt.figure(figsize=(14, 7))
plt.bar(x,
        y,
        width=0.75,
        alpha=0.5,
        color='b',
        label='PMF',
        )
plt.legend()
plt.show()

(Figure: bar chart of the Poisson PMF with mu=5)

The effect of the mu value

# PMFs for different values of mu
plt.figure(figsize=(14, 7))

x = np.arange(20)
y1 = stats.poisson.pmf(x, mu=1)
y2 = stats.poisson.pmf(x, mu=5)
y3 = stats.poisson.pmf(x, mu=10)

plt.scatter(x, y1, alpha=0.5, color='r', s=100)
plt.plot(x, y1, alpha=0.5, color='r', label='mu=1')

plt.scatter(x, y2, alpha=0.5, color='g', s=100)
plt.plot(x, y2, alpha=0.5, color='g', label='mu=5')

plt.scatter(x, y3, alpha=0.5, color='b', s=100)
plt.plot(x, y3, alpha=0.5, color='b', label='mu=10')

plt.legend()
plt.show()

(Figure: Poisson PMFs for mu=1, mu=5, and mu=10)

Random Samples

# random samples
np.random.seed(0)
print(stats.poisson.rvs(mu=10), end='\n\n')
print(stats.poisson.rvs(mu=10, size=10), end='\n\n')

(Output: one draw, then ten draws, from Poisson(mu=10))

CDF

# CDF of Poisson(mu=5)
x = np.arange(20)
y = stats.poisson.cdf(x, mu=5)

plt.figure(figsize=(14, 7))
plt.plot(x,
         y,
         color='b',
         label='CDF',
         )
plt.legend()
plt.show()

(Figure: CDF of Poisson(mu=5))

Interval Probabilities

print("P(X<=3)={}".format(stats.poisson.cdf(k=3, mu=5)))
print("P(2<X<=8)={}".format(stats.poisson.cdf(k=8, mu=5) - stats.poisson.cdf(k=2, mu=5)))
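
For upper-tail probabilities, scipy also exposes the survival function sf(k) = 1 - cdf(k), which avoids writing the subtraction by hand:

# P(X>8), equivalent to 1 - stats.poisson.cdf(k=8, mu=5)
print("P(X>8)={}".format(stats.poisson.sf(k=8, mu=5)))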

