1. Predict
def predict(
    model,
    dataset,
    batch_size=64,
    device='cuda'
):
    """
    Run inference with a model over a dataset and collect the predictions.

    Args:
        model (torch model): Model to predict with. It is switched to eval mode.
        dataset (torch dataset): Dataset whose items are dicts holding an 'input' entry.
        batch_size (int, optional): Batch size. Defaults to 64.
        device (str, optional): Device the inputs are moved to. Defaults to 'cuda'.

    Returns:
        numpy array: Per-batch predictions concatenated along axis 0.
    """
    model.eval()

    # shuffle=False keeps the predictions in the same order as the dataset.
    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS
    )

    preds = []
    # Gradient tracking is not needed for inference.
    with torch.no_grad():
        for data in loader:
            pred = model(data['input'].to(device)).squeeze(-1)
            preds.append(pred.detach().cpu().numpy())

    preds = np.concatenate(preds, 0)
    # The original computed `preds` but never returned it, so callers got None.
    return preds
● torch.no_grad()
- gradient 계산을 안하게끔 만드는 context-manager
- model 추론 ( validation OR test data로 하는 과정들 ) 과정에서 사용한다. 쓸데없는 메모리 소비를 줄인다.
- 다른 thread에 영향을 미치지 않는다(thread local)
● with ~~:
- 자원을 획득하고 사용 후 반납해야 하는 경우 사용
- 객체의 lifecycle(생성 > 사용 > 소멸)을 설계할 수 있다.
- e.g. : with open([file]): file을 열었을 때 닫아주어야 하는데, 보통 close()를 써도 되는데, 파일 처리를 수행하는 도중 오류가 발생하면 close()가 실행이 안될 수도 있음. 이럴 때, with statement안에 넣어줬으면, 에러가 발생하든 안하든 close()가 실행됨
- with torch.no_grad(): 블록 안에서는 gradient 계산이 꺼지고, 블록을 벗어나면 이전 상태로 복원된다.
2. Group K-fold
from sklearn.model_selection import GroupKFold
def k_fold(config, df, df_test):
    """
    Run a grouped k-fold cross validation, using "breath_id" as the group key.

    Only the folds listed in config.selected_folds are trained. Rows of the
    out-of-fold array that belong to folds that were not selected stay at zero.

    Args:
        config: Object exposing `k` (number of folds) and `selected_folds`.
        df: Training frame; must contain a "breath_id" column.
        df_test: Test frame, forwarded to `train` for every selected fold.

    Returns:
        tuple: (out-of-fold predictions, test predictions averaged over the trained folds).
    """
    pred_oof = np.zeros(len(df))
    fold_test_preds = []

    splitter = GroupKFold(n_splits=config.k)
    fold_indices = list(splitter.split(X=df, y=df, groups=df["breath_id"]))

    for i, (train_idx, val_idx) in enumerate(fold_indices):
        # Skip folds that were not requested.
        if i not in config.selected_folds:
            continue

        print(f"\n------------- Fold {i + 1} / {config.k} -------------\n")

        train_part = df.iloc[train_idx].copy().reset_index(drop=True)
        val_part = df.iloc[val_idx].copy().reset_index(drop=True)

        pred_val, pred_test = train(config, train_part, val_part, df_test, i)

        # Write validation predictions back at their original row positions.
        pred_oof[val_idx] = pred_val.flatten()
        fold_test_preds.append(pred_test.flatten())

    print(f'\n -> CV MAE : {compute_metric(df, pred_oof) :.3f}')

    return pred_oof, np.mean(fold_test_preds, 0)
- Group K-fold(순서를 잘 기억)
gkf = GroupKFold(n_splits= n_splits) # 객체 생성
splits = gkf.split(X=X, y=y, groups=group)
for i, (train_idx, val_idx) in enumerate(splits):
print( ' Fold ~~~ ' )
# X_train, X_test, y_train, y_test 지정
X_train, X_test = ...
y_train, y_test = ...
....
pred_val, pred_test = ... (학습)
....
- Flatten
: n차원 데이터를 1차원으로 바꿔버림
3. Plot_prediction : predict와 실제값을 비교
def plot_prediction(sample_id, df):
    """
    Plot the prediction next to the recorded signals for a single breath.

    Args:
        sample_id: Value of 'breath_id' selecting the breath to plot.
        df: Frame with the columns 'breath_id', 'time_step', 'pred',
            'pressure' and 'u_out'.
    """
    df_breath = df[df['breath_id'] == sample_id]

    # The original also built an unused `cols` list here; it was never read,
    # so it has been removed. The plotted columns are fixed below.
    plt.figure(figsize=(12, 4))
    for col in ['pred', 'pressure', 'u_out']:
        plt.plot(df_breath['time_step'], df_breath[col], label=col)

    # Score for this breath only, shown in the title.
    metric = compute_metric(df_breath, df_breath['pred'])

    plt.legend()
    plt.title(f'Sample {sample_id} - MAE={metric:.3f}')
4. submission
# Store the test predictions in the 'pressure' column of the submission table.
# NOTE(review): assumes `sub` is a pandas DataFrame row-aligned with `pred_test` — confirm.
sub['pressure'] = pred_test
# Write the submission file.
sub.to_csv('submission.csv', index=False)
5. utility
import os
import torch
import random
import numpy as np
def seed_everything(seed):
    '''
    Seed every random number generator used here with the same value.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and CUDA), exports
    PYTHONHASHSEED, and sets the cuDNN deterministic / benchmark flags.

    Args:
        seed (int): Number of the seed.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Remaining generators all take the seed as their single argument.
    for set_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        set_seed(seed)

    # Ask cuDNN for deterministic behaviour and turn off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
def count_parameters(model, all=False):
    '''
    Count the parameters of a model.

    Args:
        model (torch model): Model to count the parameters of.
        all (bool, optional): Whether to also count parameters that do not
            require gradients. Defaults to False.

    Returns:
        int: Number of parameters.
    '''
    total = 0
    for param in model.parameters():
        # Without `all`, only parameters that require gradients are counted.
        if all or param.requires_grad:
            total += param.numel()
    return total
def worker_init_fn(worker_id):
    '''
    Re-seed NumPy for a worker so each worker id gets its own seed.

    The new seed is the first entry of NumPy's current state key plus the worker id.

    Args:
        worker_id (int): Id of the worker.
    '''
    state_key = np.random.get_state()[1]
    base_seed = state_key[0]
    np.random.seed(base_seed + worker_id)
def save_model_weights(model, filename, verbose=1, cp_folder=''):
    '''
    Save the weights (state dict) of a PyTorch model.

    Args:
        model (torch model): Model to save the weights of.
        filename (str): Name of the checkpoint.
        verbose (int, optional): Whether to display infos. Defaults to 1.
        cp_folder (str, optional): Folder to save to. Defaults to ''.
    '''
    if verbose:
        print(f'\n -> Saving weights to {os.path.join(cp_folder, filename)}\n')

    weights = model.state_dict()
    destination = os.path.join(cp_folder, filename)
    torch.save(weights, destination)
참고자료
https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=wideeyed&logNo=221653260516
'정리 > Machine Learning' 카테고리의 다른 글
1. Predict
def predict(
    model,
    dataset,
    batch_size=64,
    device='cuda'
):
    """
    Run inference with a model over a dataset and collect the predictions.

    Args:
        model (torch model): Model to predict with. It is switched to eval mode.
        dataset (torch dataset): Dataset whose items are dicts holding an 'input' entry.
        batch_size (int, optional): Batch size. Defaults to 64.
        device (str, optional): Device the inputs are moved to. Defaults to 'cuda'.

    Returns:
        numpy array: Per-batch predictions concatenated along axis 0.
    """
    model.eval()

    # shuffle=False keeps the predictions in the same order as the dataset.
    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS
    )

    preds = []
    # Gradient tracking is not needed for inference.
    with torch.no_grad():
        for data in loader:
            pred = model(data['input'].to(device)).squeeze(-1)
            preds.append(pred.detach().cpu().numpy())

    preds = np.concatenate(preds, 0)
    # The original computed `preds` but never returned it, so callers got None.
    return preds
● torch.no_grad()
- gradient 계산을 안하게끔 만드는 context-manager
- model 추론 ( validation OR test data로 하는 과정들 ) 과정에서 사용한다. 쓸데없는 메모리 소비를 줄인다.
- 다른 thread에 영향을 미치지 않는다(thread local)
● with ~~:
- 자원을 획득하고 사용 후 반납해야 하는 경우 사용
- 객체의 lifecycle(생성 > 사용 > 소멸)을 설계할 수 있다.
- e.g. : with open([file]): file을 열었을 때 닫아주어야 하는데, 보통 close()를 써도 되는데, 파일 처리를 수행하는 도중 오류가 발생하면 close()가 실행이 안될 수도 있음. 이럴 때, with statement안에 넣어줬으면, 에러가 발생하든 안하든 close()가 실행됨
- with torch.no_grad(): 블록 안에서는 gradient 계산이 꺼지고, 블록을 벗어나면 이전 상태로 복원된다.
2. Group K-fold
from sklearn.model_selection import GroupKFold
def k_fold(config, df, df_test):
    """
    Run a grouped k-fold cross validation, using "breath_id" as the group key.

    Only the folds listed in config.selected_folds are trained. Rows of the
    out-of-fold array that belong to folds that were not selected stay at zero.

    Args:
        config: Object exposing `k` (number of folds) and `selected_folds`.
        df: Training frame; must contain a "breath_id" column.
        df_test: Test frame, forwarded to `train` for every selected fold.

    Returns:
        tuple: (out-of-fold predictions, test predictions averaged over the trained folds).
    """
    pred_oof = np.zeros(len(df))
    fold_test_preds = []

    splitter = GroupKFold(n_splits=config.k)
    fold_indices = list(splitter.split(X=df, y=df, groups=df["breath_id"]))

    for i, (train_idx, val_idx) in enumerate(fold_indices):
        # Skip folds that were not requested.
        if i not in config.selected_folds:
            continue

        print(f"\n------------- Fold {i + 1} / {config.k} -------------\n")

        train_part = df.iloc[train_idx].copy().reset_index(drop=True)
        val_part = df.iloc[val_idx].copy().reset_index(drop=True)

        pred_val, pred_test = train(config, train_part, val_part, df_test, i)

        # Write validation predictions back at their original row positions.
        pred_oof[val_idx] = pred_val.flatten()
        fold_test_preds.append(pred_test.flatten())

    print(f'\n -> CV MAE : {compute_metric(df, pred_oof) :.3f}')

    return pred_oof, np.mean(fold_test_preds, 0)
- Group K-fold(순서를 잘 기억)
gkf = GroupKFold(n_splits= n_splits) # 객체 생성
splits = gkf.split(X=X, y=y, groups=group)
for i, (train_idx, val_idx) in enumerate(splits):
print( ' Fold ~~~ ' )
# X_train, X_test, y_train, y_test 지정
X_train, X_test = ...
y_train, y_test = ...
....
pred_val, pred_test = ... (학습)
....
- Flatten
: n차원 데이터를 1차원으로 바꿔버림
3. Plot_prediction : predict와 실제값을 비교
def plot_prediction(sample_id, df):
    """
    Plot the prediction next to the recorded signals for a single breath.

    Args:
        sample_id: Value of 'breath_id' selecting the breath to plot.
        df: Frame with the columns 'breath_id', 'time_step', 'pred',
            'pressure' and 'u_out'.
    """
    df_breath = df[df['breath_id'] == sample_id]

    # The original also built an unused `cols` list here; it was never read,
    # so it has been removed. The plotted columns are fixed below.
    plt.figure(figsize=(12, 4))
    for col in ['pred', 'pressure', 'u_out']:
        plt.plot(df_breath['time_step'], df_breath[col], label=col)

    # Score for this breath only, shown in the title.
    metric = compute_metric(df_breath, df_breath['pred'])

    plt.legend()
    plt.title(f'Sample {sample_id} - MAE={metric:.3f}')
4. submission
# Store the test predictions in the 'pressure' column of the submission table.
# NOTE(review): assumes `sub` is a pandas DataFrame row-aligned with `pred_test` — confirm.
sub['pressure'] = pred_test
# Write the submission file.
sub.to_csv('submission.csv', index=False)
5. utility
import os
import torch
import random
import numpy as np
def seed_everything(seed):
    '''
    Seed every random number generator used here with the same value.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and CUDA), exports
    PYTHONHASHSEED, and sets the cuDNN deterministic / benchmark flags.

    Args:
        seed (int): Number of the seed.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Remaining generators all take the seed as their single argument.
    for set_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        set_seed(seed)

    # Ask cuDNN for deterministic behaviour and turn off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
def count_parameters(model, all=False):
    '''
    Count the parameters of a model.

    Args:
        model (torch model): Model to count the parameters of.
        all (bool, optional): Whether to also count parameters that do not
            require gradients. Defaults to False.

    Returns:
        int: Number of parameters.
    '''
    total = 0
    for param in model.parameters():
        # Without `all`, only parameters that require gradients are counted.
        if all or param.requires_grad:
            total += param.numel()
    return total
def worker_init_fn(worker_id):
    '''
    Re-seed NumPy for a worker so each worker id gets its own seed.

    The new seed is the first entry of NumPy's current state key plus the worker id.

    Args:
        worker_id (int): Id of the worker.
    '''
    state_key = np.random.get_state()[1]
    base_seed = state_key[0]
    np.random.seed(base_seed + worker_id)
def save_model_weights(model, filename, verbose=1, cp_folder=''):
    '''
    Save the weights (state dict) of a PyTorch model.

    Args:
        model (torch model): Model to save the weights of.
        filename (str): Name of the checkpoint.
        verbose (int, optional): Whether to display infos. Defaults to 1.
        cp_folder (str, optional): Folder to save to. Defaults to ''.
    '''
    if verbose:
        print(f'\n -> Saving weights to {os.path.join(cp_folder, filename)}\n')

    weights = model.state_dict()
    destination = os.path.join(cp_folder, filename)
    torch.save(weights, destination)
참고자료
https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=wideeyed&logNo=221653260516