def chunk_load(path, file, sample_ratio, seed, usecols=None, chunksize=None, encoding=None, sep=None, names=None, dtype_dict=None):
    """Load a delimited text file chunk by chunk, keeping a random sample of every chunk.

    The file is read with ``header=None``, i.e. its first line is treated as data.

    Args :
    -------
        path : str
            Directory that contains the file. A trailing path separator is optional.
        file : str
            File name
        sample_ratio : float
            Fraction of rows kept from each chunk (``DataFrame.sample(frac=...)``)
        seed : int
            ``random_state`` for the sampling. The same value is used for every chunk.
        usecols, encoding, sep, names : optional
            Forwarded unchanged to ``pandas.read_csv``
        chunksize : int, default=None
            Number of rows per chunk. With None the file is read in one piece
            and sampled once.
        dtype_dict : dictionary, default=None
            Dictionary contains column name and column format. Ex. {'age': 'int8'}

    Returns :
    -------
        data : pandas.DataFrame
            Sampled rows of all chunks concatenated along axis 0 (index not reset)
    """
    import os

    # The progress bar is a convenience only; fall back to plain iteration without tqdm
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # os.path.join inserts the separator when `path` does not already end with one
    reader = pd.read_csv(os.path.join(path, file), encoding=encoding, chunksize=chunksize,
                         sep=sep, usecols=usecols, names=names, dtype=dtype_dict, header=None)
    # Without a chunksize read_csv returns one DataFrame instead of an iterator of chunks
    chunks = [reader] if isinstance(reader, pd.DataFrame) else reader
    sampled = [chunk.sample(frac=sample_ratio, random_state=seed) for chunk in progress(chunks)]
    return pd.concat(sampled, axis=0)

# Example usage

fdir = '/kaggle/input/xxxxxxxxxxx/'  # trailing slash keeps the directory and the file name separate
file = 'xxxxx.txt'
sample_ratio = 0.05  # fraction of rows kept from every chunk
seed = 39

# Tab-separated file, read one million rows at a time
data = chunk_load(fdir, file, sample_ratio, seed, usecols=None, chunksize=10**6, encoding=None, sep='\t', names=None, dtype_dict=None)

2023-08-18

忘備録　メモリーの有効利用

忘備録 Python

データが大きくなるとメモリー残量を気にしながらモデル作成しないとメモリーオーバーで落ちる。
これはかなり悲しい出来事です。
そこで↓
Kaggleで紹介されていた関数。

def reduce_mem_usage(df, verbose=False):
    """Downcast numeric columns of ``df`` to the smallest dtype covering their value range.

    Columns whose dtype is one of int16/32/64 or float16/32/64 are inspected;
    all other columns are left untouched. Integer columns become the narrowest
    of int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Float columns become float16 or float32 when their
    range fits, otherwise float64.

    NOTE: the float choice looks at the value range only. float16/float32
    carry fewer significant digits, so values may be rounded by the cast.

    Args :
    -------
        df : pandas.DataFrame
            Frame to shrink. It is modified in place.
        verbose : bool, default=False
            Print the resulting memory usage and the percentage saved.

    Returns :
    -------
        df : pandas.DataFrame
            The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; inclusive bounds so e.g. -128..127 fits int8
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail every comparison and end up as float64
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

もっと細かい管理はこっち↓
本来、不要らしいけど。。

import gc

# Explicitly trigger a garbage-collection pass
gc.collect()

2023-08-17

忘備録　LightGBM、CatBoost：二値分類、目的関数への重みづけ

忘備録

二値分類タスクでは、クラスの不均衡性で目的関数の重みづけが重要となる場合があります。

LightGBMの場合

# LightGBM parameters for the imbalanced binary-classification setup
params = dict(
    objective=objective,
    metric=metric,
    boosting_type=boosting_type,
    device=device,
    random_state=39,
    is_unbalance=True,  # set this
    verbose=-1,
)

# Function that computes the class weights
def calc_log_loss_weight(y_true):
    """Return inverse class-frequency weights ``(w0, w1)`` for binary labels.

    Each weight is ``1 / (class count / number of samples)``, so the rarer
    class receives the larger weight.

    Args :
    -------
        y_true : array-like of int
            1-D labels made of 0 and 1 (list, numpy array or pandas Series)

    Returns :
    -------
        w0, w1 : float
            Weight for class 0 and weight for class 1

    Raises :
    -------
        ValueError
            If class 0 or class 1 does not occur in ``y_true``
    """
    labels = np.asarray(y_true)
    # minlength=2 keeps index 1 valid even when label 1 never occurs
    counts = np.bincount(labels, minlength=2)
    if counts[0] == 0 or counts[1] == 0:
        raise ValueError('y_true must contain at least one sample of class 0 and of class 1')
    total = labels.shape[0]
    w0, w1 = 1 / (counts[0] / total), 1 / (counts[1] / total)
    return w0, w1

# Compute the class weights for the training and validation labels
train_w0, train_w1 = calc_log_loss_weight(t_train)
valid_w0, valid_w1 = calc_log_loss_weight(t_test)
print(train_w0, train_w1)

# Apply the weights: every label is mapped to its class weight and passed as the per-sample weight
lgb_train = lgb.Dataset(X_train, t_train, weight=pd.Series(t_train).map({0: train_w0, 1: train_w1}))
lgb_eval = lgb.Dataset(X_test, t_test, weight=pd.Series(t_test).map({0: valid_w0, 1: valid_w1}))

CatBoostの場合
基本は同じ

# Function that computes the class weights
def calc_log_loss_weight(y_true):
    """Return inverse class-frequency weights ``(w0, w1)`` for binary labels.

    Each weight is ``1 / (class count / number of samples)``, so the rarer
    class receives the larger weight.

    Args :
    -------
        y_true : array-like of int
            1-D labels made of 0 and 1 (list, numpy array or pandas Series)

    Returns :
    -------
        w0, w1 : float
            Weight for class 0 and weight for class 1

    Raises :
    -------
        ValueError
            If class 0 or class 1 does not occur in ``y_true``
    """
    labels = np.asarray(y_true)
    # minlength=2 keeps index 1 valid even when label 1 never occurs
    counts = np.bincount(labels, minlength=2)
    if counts[0] == 0 or counts[1] == 0:
        raise ValueError('y_true must contain at least one sample of class 0 and of class 1')
    total = labels.shape[0]
    w0, w1 = 1 / (counts[0] / total), 1 / (counts[1] / total)
    return w0, w1

# Compute the class weights for the training and evaluation labels
train_w0, train_w1 = calc_log_loss_weight(y_train)
valid_w0, valid_w1 = calc_log_loss_weight(y_eval)
print(train_w0, train_w1)


# Apply the weights: every label is mapped to its class weight and passed as the per-sample weight
# (.values hands Pool a plain array instead of a pandas Series)
xgb_train = Pool(data=X_train, label=y_train, cat_features=cf, weight=pd.Series(y_train).map({0: train_w0, 1: train_w1}).values)
xgb_eval = Pool(data=X_eval, label=y_eval, cat_features=cf, weight=pd.Series(y_eval).map({0: valid_w0, 1: valid_w1}).values)

# Predict probabilities
# NOTE(review): the prediction is made with `model` but the tree limit comes from `bst` — confirm both refer to the same fitted model
pred = model.predict(xgb_eval, prediction_type='Probability',ntree_end=bst.best_iteration_)

その他
www.kaggle.com

2023-07-04

忘備録 tsflex:時系列処理と特徴抽出

忘備録

kaggle Parkinson's Freezing of Gait Predictionの中で見つけたライブラリ。

時系列データのいろいろな処理が一括でできて、しかも軽い。

www.kaggle.com

残念ながらこのコンペの上位はNNを使用したモデルで占められていて、LightGBMは惨敗でした。信号や画像のデータはやっぱりNNが強いのか。

Parkinson's Freezing of Gait Prediction

Why5

巷にあふれる情報（データ）をもとに“なぜ”、“なぜ”、“なぜ”……と掘り下げるブログです。現在は主にAIによる予測スキルの向上に励んでいます。

忘備録　ポイント情報をメッシュに落とし込む

忘備録　LightGBM カスタムメトリックの使い方

忘備録　Catboost eval_metricの使い方

忘備録　大きなデータを分割読み込みする方法

忘備録　メモリーの有効利用

忘備録　LightGBM、CatBoost：二値分類、目的関数への重みづけ

忘備録 tsflex:時系列処理と特徴抽出

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction

Parkinson's Freezing of Gait Prediction