python pandas 片段

Posted 2021-05-08
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了 python pandas 片段相关的知识，希望对你有一定的参考价值。
import pandas as pd
# Change the maximum number of rows shown when a frame is displayed.
pd.set_option("display.max_rows", 100)

# Load a CSV file into a DataFrame; "/path/to/file" is a placeholder path.
df = pd.read_csv("/path/to/file")

# Load the result of a SQL query into a DataFrame.
import sqlite3
from contextlib import closing

# SQLite is only the example here; use the connector that matches your
# database (MySQL, ...).
# closing() releases the connection even when the query raises. A bare
# `with sqlite3.connect(...)` only manages the transaction and leaves the
# connection open.
with closing(sqlite3.connect('hoge.db')) as con:
    df_sql = pd.read_sql('select * from hoge', con)

# Export to CSV.
# NOTE(review): `re_kanji` is not defined anywhere in this snippet; presumably
# it stands for any data that pd.DataFrame() accepts. Confirm before running.
pd.DataFrame(re_kanji).to_csv("/path/to/file")

# select: extract specific columns / rows
df[["property1", "property2"]]
df.loc[0:100, ["property1", "property2"]]  # when the rows should be limited as well
# iloc is positional: give integer positions or slices as (row, column).
# Label strings are rejected, so concrete positions are used here.
df.iloc[0:100, 0:2]
df.iloc[0, :]  # every column of row 0


# grouping (aggregation)
## Plain aggregation
df.groupby("category_property")["numeric_property"].count() # count per group

## groupby turns the group key into the index, which can get in the way, so reset the index
df.groupby("category_property")["numeric_property"].count().reset_index()

## Aggregate by date/time
# The index_col value is a placeholder: put the name of the date column to group by.
df = pd.read_csv("/path/to/file", index_col="<グルーピンしたい日付属性>", parse_dates=True)
df["count"] = 1 # helper column needed for the aggregation
df = df.resample("M").sum() # use Y for yearly, W for weekly
# For a date format that is not a standard one:
# parser = lambda date: pd.to_datetime(date, format='%Y年%m月%d日')
# df_jp = pd.read_csv('data/src/sample_date_jp.csv', index_col='date', parse_dates=True, date_parser=parser)

## Rounding / truncating timestamps
# Each line derives a new column from the 'timestamp' column.
df['date'] = df['timestamp'].dt.date
df['hour'] = df['timestamp'].dt.hour
df['15min'] = pd.DatetimeIndex(df['timestamp']).round('15min') # 15-minute intervals

## Aggregate several things at once
### Several aggregations applied to every column
# The aggregations are given by name, so no numpy import is needed
# (this file only imports pandas).
df.groupby('class').aggregate(['mean', 'sum'])

### A different aggregation for each column
df = (
    df.groupby('property_1')
    .agg({'property_2': 'count', 'property_3': 'nunique'})
    .reset_index()
)
# Rename the three resulting columns (group key, count, distinct count).
df.columns = ['hoge_id', 'hoge1_cnt', 'hoge2_cnt']

### Max / min per user
# Sort by 'property' in descending order, then keep the first entry of each
# user_id group (ascending=True would give the smallest instead).
df.sort_values('property', ascending=False).groupby(['user_id'], as_index=False).first()

### Use a pivot table to make the data easier to read
#hobby_years_coding_count = df[['Hobby', 'YearsCoding']].\
#    groupby(['Hobby', 'YearsCoding']).\
#    size().\
#    reset_index(name='counts').\
#    pivot(index='YearsCoding', columns='Hobby', values='counts')
#hobby_years_coding_count


# Binning (split values into intervals)
terms = [1990, 1995, 2000]  # bin edges: here 1990-1995 and 1995-2000
df_cut_data1 = pd.cut(df["property"], terms)  # the `right` option chooses which edge of a bin is closed
# Count per bin with the Series method; the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
df_cut_data1.value_counts()
df_cut_data2 = pd.cut(df["property"], 5)  # an integer asks for that many equal-width bins (edges may be fractional)
df_cut_data3 = pd.qcut(df["property"], 5)  # split at quantiles instead


# where (row filtering)
df.query("property == 'value'", inplace=True)  # inplace=True: df itself is filtered
df.query("venue_category in ['Airport', 'Subway']")
# Alternatives using boolean masks:
# df = df[df["property"].isin(["value"])]
# result = df[df['property'].str.match(r'regex_pattern')]

## Filter by period
import datetime as dt  # only used by the commented-out alternative below
df["_time"] = pd.to_datetime(df["_time"])
df = df.query('"2016-10-13" <= _time <= "2016-10-14"')
# df = df[(dt.datetime(2015,1,1) < df["_time"]) & (df["_time"] <= dt.datetime(2018,12,31))]

## Null check
df.isnull().sum()
df.info()

## Extract the rows that are not null
# Relies on a null value never comparing equal to itself, so `col == col`
# is true only where `col` holds a value.
df.query("col == col")

# ★ Applying functions
## Apply a function to each column (axis=0) or each row (axis=1); the result is a pandas.Series
df.apply(lambda row: sum(row), axis=1)

## Apply a function to the DataFrame as a whole
df.pipe(lambda frame: frame[frame.value > -2])


# convert
## df <-> list
lists = df.values.tolist()
df = pd.DataFrame(lists)

## Convert to another type
# One concrete exporter is shown; the same family includes to_csv(),
# to_json(), and others.
df.to_dict()

## Convert to upper case
df["property"] = df["property"].str.upper()




# Add columns
df.assign(
    # derived from a single column
    round_hoge=lambda frame: frame.hoge.round(),
    # computed across each whole row
    total=lambda frame: frame.apply(lambda row: sum(row), axis=1),
)

# delete
## Drop duplicates (removes them from the original data)
df.drop_duplicates(subset=["property"], keep = "first", inplace=True)


## Drop a column (removes it from the original data)
df.drop(["property"], axis = 1,  inplace=True)

## Drop rows that contain NaN
df.dropna() # can also be restricted to rows where every column is NaN

# There are two ways to combine frames: merge or join. Use join for index-to-index joins.
# NOTE(review): df_a / df_b are not defined in this snippet; they stand for
# any two frames that share a "KEY" column.
df_merged = df_a.merge(df_b, on="KEY")
# df_a.join(df_b)

# unique count (not duplicated)  http://pynote.hatenablog.com/entry/pandas-process-duplicates
df.nunique()

# order by (sort)
# descending order
df.sort_values(["COL1","COL2"] , ascending=False)

# Put complex processing in a function and invoke it through .pipe()
def hoge_func(df):
    """Return ``df * 3``; a sample transformation meant to be passed to ``.pipe()``."""
    tripled = df * 3
    return tripled
df = (
    pd.read_csv('/path/to/file', low_memory=False)
    .pipe(hoge_func)
)

# For simple cases a lambda expression works too
df.pipe(lambda frame: frame[frame.f1 > 0])


# その他、awesomeなpandasの処理はこれを参考にしてみる
# https://github.com/ghmagazine/awesomebook/tree/master/preprocess
# https://data.gunosy.io/entry/pandas-pratical-analysis
以上是关于 python pandas 片段的主要内容，如果未能解决你的问题，请参考以下文章