[Dacon - Top 18%] Code Similarity Judgment Season 2
0. Import libraries and load the saved data
import numpy as np
import pandas as pd
import os
import random
from tqdm.auto import tqdm
from datasets import load_metric, load_dataset, Dataset, concatenate_datasets
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoConfig, AutoTokenizer, RobertaForSequenceClassification, DataCollatorWithPadding
from transformers import AdamW
from transformers import get_scheduler, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import math
import easydict
import warnings
warnings.filterwarnings("ignore")
# Fix random seeds for reproducibility
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"Seed set as {seed}")
seed_everything(42)
Seed set as 42
# Metric function reported during evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    # Note: AUROC is computed on the hard predictions here, not on probabilities
    auc = roc_auc_score(labels, preds)
    return {"accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "auroc": auc}
path = "/home/dst78/Code_Similar/result/"
df_train = load_dataset("csv", data_files = path + "train_data_1.csv")["train"]
df_valid = load_dataset("csv", data_files = path + "valid_data_1.csv")["train"]
raw_dataset = concatenate_datasets([df_train, df_valid])
# Load the tokenizer of a code-pretrained transformer (GraphCodeBERT)
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"
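Setting truncation_side = "left" makes the tokenizer drop tokens from the beginning of over-long inputs and keep the end. A minimal illustrative check on a made-up snippet (not from the dataset):
long_code = "int unused;\n" * 400 + "int main() { return 0; }"
ids = tokenizer(long_code, max_length=32, truncation=True)["input_ids"]
print(tokenizer.decode(ids, skip_special_tokens=True))  # the tail (main) survives; the head is truncated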
# Tokenize the C++ source code pairs with the tokenizer
def example_fn(examples):
    outputs = tokenizer(examples["code1"], examples["code2"], padding=True, max_length=512, truncation=True)
    if "similar" in examples:
        outputs["labels"] = examples["similar"]
    return outputs
dataset = raw_dataset.map(example_fn, remove_columns=df_train.column_names)
1. Preprocessing and model training
sample_train_path = "/home/dst78/Code_Similar/sample_train.csv"
from collections import deque
# Preprocessing for the code dataset:
# its main job is stripping // line comments and /* ... */ block comments
def preprocess_script(script):
    new_script = deque()
    in_comment_block = False
    for line in script.split("\n"):  # iterate over lines, not characters
        if in_comment_block:
            if "*/" in line:
                in_comment_block = False
                line = line[line.index("*/") + 2:]
            else:
                continue
        else:
            if "/*" in line:
                if "*/" in line:
                    # block comment opens and closes on the same line
                    line = line[:line.index("/*")] + line[line.index("*/") + 2:]
                else:
                    in_comment_block = True
                    line = line[:line.index("/*")]
        if "//" in line:
            line = line[:line.index("//")]
        line = line.rstrip()
        if line:
            new_script.append(line)
    return "\n".join(new_script)
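A quick illustrative run on a toy snippet (not from the dataset) shows what the function strips:
toy = "#include <iostream>  // header\n/* block\n   comment */\nint main() { return 0; } /* inline */"
print(preprocess_script(toy))
# #include <iostream>
# int main() { return 0; }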
sample_train = pd.read_csv(sample_train_path)
sample_train.head(5)
| | code1_path | code2_path | code1 | code2 | similar |
|---|---|---|---|---|---|
| 0 | ./train_code/problem393/problem393_19.cpp | ./train_code/problem033/problem033_439.cpp | #include <bits/stdc++.h>\n\nusing namespace st... | #include <algorithm>\n#include <bitset>\n#incl... | 0 |
| 1 | ./train_code/problem019/problem019_210.cpp | ./train_code/problem019/problem019_63.cpp | #include <iostream>\n\nusing namespace std;\n\... | #include <iostream>\n#include <string>\nusing ... | 1 |
| 2 | ./train_code/problem107/problem107_486.cpp | ./train_code/problem107/problem107_340.cpp | #include <iostream>\n#include <vector>\nusing ... | #include <cstdio>\n#include <cstdlib>\n#includ... | 1 |
| 3 | ./train_code/problem187/problem187_257.cpp | ./train_code/problem403/problem403_135.cpp | #include <bits/stdc++.h>\n#include <unordered_... | #include <bits/stdc++.h>\nusing namespace std;... | 0 |
| 4 | ./train_code/problem173/problem173_490.cpp | ./train_code/problem173/problem173_345.cpp | #include <bits/stdc++.h>\ntypedef long long ll... | #include "bits/stdc++.h"\n#define rep(i,n) for... | 1 |
preprocess_scripts1 = []
preprocess_scripts2 = []
for code in tqdm(sample_train['code1']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts1.append(preprocessed_script)
for code in tqdm(sample_train['code2']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts2.append(preprocessed_script)
MAX_LEN = 512
tokens1 = []
tokens2 = []
for code1, code2 in tqdm(zip(sample_train['code1'], sample_train['code2'])):
    code1_str, code2_str = str(code1), str(code2)
    tokens1.append(tokenizer.tokenize(code1_str, max_length=MAX_LEN, truncation=True))
    tokens2.append(tokenizer.tokenize(code2_str, max_length=MAX_LEN, truncation=True))
sample_train["code1_token"] = tokens1
sample_train["code2_token"] = tokens2
def drop_column(data):
    data = data.drop(["code1_path", "code2_path", "code1", "code2"], axis=1)
    return data
sample_train = drop_column(sample_train)
sample_train.head(5)
| | similar | code1_token | code2_token |
|---|---|---|---|
| 0 | 0 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [c, ),, end, (, c, ), Ċ, Ċ, template, <, ty, p... |
| 1 | 1 | [#, include, Ġ<, i, ost, ream, >, Ċ, Ċ, using,... | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... |
| 2 | 1 | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... | [Ġ<, set, >, Ċ, #, include, Ġ<, i, ost, ream, ... |
| 3 | 0 | [01, Ċ, #, define, Ġmod, Ġ100, 000000, 7, Ċ, #... | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... |
| 4 | 1 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [#, include, Ġ", bits, /, st, dc, ++, ., h, ",... |
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
sample_train,
sample_train["similar"],
random_state=42,
test_size=0.1,
stratify=sample_train["similar"]
)
X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)
X_train.to_csv("sample_train_result/sample_train_data_1.csv", index=False)
X_valid.to_csv("sample_train_result/sample_valid_data_1.csv", index=False)
sample_path = "/home/dst78/Code_Similar/sample_train_result/"
df_train_sample = load_dataset("csv", data_files = sample_path + "sample_train_data_1.csv")["train"]
df_valid_sample = load_dataset("csv", data_files = sample_path + "sample_valid_data_1.csv")["train"]
sample_raw_dataset = concatenate_datasets([df_train_sample, df_valid_sample])
# The *_token columns were serialized to CSV as plain strings, so they are tokenized again here as text
def example_fn_sample(examples):
    outputs = tokenizer(examples["code1_token"], examples["code2_token"], padding=True, max_length=512, truncation=True)
    if "similar" in examples:
        outputs["labels"] = examples["similar"]
    return outputs
sample_dataset = sample_raw_dataset.map(example_fn_sample, remove_columns=df_train_sample.column_names)
save_path = "/home/dst78/Code_Similar/sample_train_result"
args = easydict.EasyDict({
"output_dir" : save_path,
"logging_dir" : save_path,
"per_device_train_batch_size" : 4,
"per_device_eval_batch_size" : 8,
"gradient_accumulation_steps": 2,
"lr" : 2e-5,
"weight_decay" : 0.0,
"epochs" : 3,
"warmup_steps" : 0,
"logging_steps" : 1000,
"save_steps" : 1000,
"evaluation_strategy" : "steps",
"eval_steps" : 1000,
"k_fold" : 5,
})
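With per_device_train_batch_size = 4 and gradient_accumulation_steps = 2, each optimizer step effectively sees 4 × 2 = 8 examples per GPU. Note that warmup_steps and save_steps set here are recomputed inside train_sample before each fold.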
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
def train_sample(args):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # model.load_state_dict(torch.load(trained_model_path))
    # Training takes a long time; if it was interrupted after a checkpoint was saved,
    # uncomment the line above and point it at the checkpoint path to resume training.
    model.to(device)
    # NOTE: the same model instance is fine-tuned continuously across the folds below.
    gap = int(len(sample_dataset) / args.k_fold)
    for i in range(args.k_fold):
        print("\n%dth Training" % (i+1))
        output_dir = args.output_dir + "_" + str(i+1)
        logging_dir = args.logging_dir + "_" + str(i+1)
        total_size = len(sample_dataset)
        total_ids = list(range(total_size))
        del_ids = list(range(i*gap, (i+1)*gap))
        training_ids = sorted(set(total_ids) - set(del_ids))
        training_dataset = sample_dataset.select(training_ids)
        eval_dataset = sample_dataset.select(del_ids)  # evaluate on the held-out fold
        args.max_steps = args.epochs * len(sample_dataset)
        args.save_steps = len(sample_dataset) // 10
        args.warmup_steps = args.max_steps // 5
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            per_device_train_batch_size=args.per_device_train_batch_size,
            per_device_eval_batch_size=args.per_device_eval_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            num_train_epochs=args.epochs,
            warmup_steps=args.warmup_steps,
            logging_dir=logging_dir,
            logging_steps=args.logging_steps,
            logging_strategy="steps",
            save_steps=args.save_steps,
            save_total_limit=5,
            save_strategy="steps",
            evaluation_strategy=args.evaluation_strategy,
            eval_steps=args.eval_steps,
            load_best_model_at_end=True,
        )
        collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=512)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=training_dataset,
            eval_dataset=eval_dataset,
            data_collator=collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
        )
        print("Training Start")
        trainer.train()
        model_path = "/home/dst78/Code_Similar/train_log/"
        model_save_path = os.path.join(model_path, f"model_{i+1}.pt")
        torch.save(model.state_dict(), model_save_path)
        import gc
        gc.collect()
        torch.cuda.empty_cache()
train_sample(args)
2. Test prediction and submission
test_data = pd.read_csv("/home/dst78/Code_Similar/test.csv")
Because the test set is large, prediction is run on it in two halves.
# Slice off the second half before truncating test_data, otherwise test_data_2 would be empty
test_data_2 = test_data[297500:]
test_data = test_data[:297500]
preprocess_scripts1 = []
preprocess_scripts2 = []
for code in tqdm(test_data['code1']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts1.append(preprocessed_script)
for code in tqdm(test_data['code2']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts2.append(preprocessed_script)
preprocess_scripts1 = []
preprocess_scripts2 = []
for code in tqdm(test_data_2['code1']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts1.append(preprocessed_script)
for code in tqdm(test_data_2['code2']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts2.append(preprocessed_script)
# Tokenize the first 297,500 test rows
MAX_LEN = 512
tokens1 = []
tokens2 = []
for code1, code2 in tqdm(zip(test_data['code1'], test_data['code2'])):
    code1_str, code2_str = str(code1), str(code2)
    tokens1.append(tokenizer.tokenize(code1_str, max_length=MAX_LEN, truncation=True))
    tokens2.append(tokenizer.tokenize(code2_str, max_length=MAX_LEN, truncation=True))
test_data["code1_token"] = tokens1
test_data["code2_token"] = tokens2
# Tokenize the remaining 297,500 test rows
MAX_LEN = 512
tokens1 = []
tokens2 = []
for code1, code2 in tqdm(zip(test_data_2['code1'], test_data_2['code2'])):
    code1_str, code2_str = str(code1), str(code2)
    tokens1.append(tokenizer.tokenize(code1_str, max_length=MAX_LEN, truncation=True))
    tokens2.append(tokenizer.tokenize(code2_str, max_length=MAX_LEN, truncation=True))
test_data_2["code1_token"] = tokens1
test_data_2["code2_token"] = tokens2
test_data = test_data.drop(["pair_id", "code1", "code2"], axis=1)
test_data
| | code1_token | code2_token |
|---|---|---|
| 0 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... |
| 1 | [#, include, <, bits, /, st, dc, ++, ., h, >, ... | [Ġ<<, Ġend, l, ;, Ċ, //, Ġ, ĉ, }, Ċ, Ġ, Ċ, //,... |
| 2 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [Ċ, #, define, Ġp, ob, Ġpop, _, back, Ċ, #, de... |
| 3 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [N, -, 1, +, M, );, Ċ, Ġ, Ġ, Ġ, Ġrep, (, i, ,,... |
| 4 | [#, include, <, bits, /, st, dc, ++, ., h, >, ... | [N, *, 2, -, 1, ];, Ċ, void, Ġse, g, use, (){,... |
| ... | ... | ... |
| 297495 | [Ġ}, Ċ, Ġ, Ġ, Ġ, Ġ, Ġ, Ġ, Ġ, Ġ}, ĊĊ, Ġ, Ġ, Ġ, ... | [(, A, ,, ĠB, ,, Ġsizeof, (, A, ));, Ċ, #, def... |
| 297496 | [ãĢ, Ģ, ãĢ, Ģ, ãĢ, Ģ, ï, ¼, ı, ãĢ, Ģ, ãĢ, Ģ, ã... | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... |
| 297497 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [#, include, <, c, std, io, >, Ċ, #, include, ... |
| 297498 | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... |
| 297499 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [ĠP, Ġ=, Ġpair, <, int, ,, int, >, ;, Ċ, Ċ, /*... |
297500 rows × 2 columns
test_data_2 = test_data_2.drop(["pair_id", "code1", "code2"], axis=1)
test_data_2
test_data_1 and test_data_2 are tokenized separately and then combined.
Accordingly, the two tokenized datasets are saved separately and merged afterwards.
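As a quick sanity check (illustrative), the two halves should add back up to the 595,000 test pairs seen in the final submission:
assert len(test_data) + len(test_data_2) == 595000  # 297,500 + 297,500 rows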
test_data.to_csv("result/test_data_1.csv", index=False)
test_data_2.to_csv("result/test_data_2.csv", index=False)
test_path = "/home/dst78/Code_Similar/result/"
df_test = load_dataset("csv", data_files = test_path + "test_data_1.csv")["train"]
df_test_2 = load_dataset("csv", data_files = test_path + "test_data_2.csv")["train"]
'''
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"
trained_model_path = "/home/dst78/Code_Similar/train_log/voting_model_1.pt"
model.load_state_dict(torch.load(trained_model_path))
model.eval()
'''
test_dataset = df_test.map(example_fn_sample, remove_columns=["code1_token", "code2_token"])
test_dataset_2 = df_test_2.map(example_fn_sample, remove_columns=["code1_token", "code2_token"])
collator = DataCollatorWithPadding(tokenizer=tokenizer)
testloader = DataLoader(test_dataset,
batch_size=16,
shuffle=False,
collate_fn = collator
)
testloader_2 = DataLoader(test_dataset_2,
batch_size=16,
shuffle=False,
collate_fn = collator
)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
Model 5 is the best-performing model (checkpoint), so it is used to predict the test set and build the submission.
# Predict the first half of the test set with the best checkpoint (model_5.pt)
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
load_path = "/home/dst78/Code_Similar/train_log/model_5.pt"
model.load_state_dict(torch.load(load_path, map_location=torch.device("cpu")))
model.to(device)
model.eval()
all_logits = []
progress_bar = tqdm(enumerate(testloader), total=len(testloader), leave=True, position=0)
for j, data in progress_bar:
    with torch.no_grad():
        outputs = model(
            data['input_ids'].to(device),
            data['attention_mask'].to(device),
        )
    all_logits.append(outputs.logits.cpu().numpy())
test_logits = np.concatenate(all_logits, axis=0)
predicted_labels = np.argmax(test_logits, axis=1)
# Predict the second half with the same checkpoint
all_logits_2 = []
progress_bar = tqdm(enumerate(testloader_2), total=len(testloader_2), leave=True, position=0)
for j, data in progress_bar:
    with torch.no_grad():
        outputs = model(
            data['input_ids'].to(device),
            data['attention_mask'].to(device),
        )
    all_logits_2.append(outputs.logits.cpu().numpy())
test_logits_2 = np.concatenate(all_logits_2, axis=0)
predicted_labels_2 = np.argmax(test_logits_2, axis=1)
Soft voting procedure (weighted voting across the fold models, kept commented out for reference)
'''
def majority_voting(predictions, weights):
    assert len(predictions) == len(weights)
    result_tensor = torch.zeros(len(predictions), dtype=torch.float)
    for idx, (pred, weight) in enumerate(zip(predictions, weights)):
        result_tensor[idx] = torch.tensor(pred, dtype=torch.float) * weight
    return torch.argmax(result_tensor)

def test_sample(data):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    weights = [0.9, 0.8, 0.85, 0.88]
    all_predictions = []
    for i in tqdm(range(0, 4)):
        model_path = "/home/dst78/Code_Similar/train_log/"
        model_load_path = os.path.join(model_path, f"voting_model_{i+1}.pt")
        model.load_state_dict(torch.load(model_load_path))
        model.to(device)
        model.eval()
        with torch.no_grad():
            predictions = []
            for batch in tqdm(data):
                inputs = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(inputs, attention_mask=attention_mask)
                predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
        all_predictions.append(predictions)
    final_predictions = []
    for preds in zip(*all_predictions):
        final_predictions.append(majority_voting(preds, weights))
    return final_predictions

final_predictions_1 = test_sample(testloader)
'''
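For reference, a true soft-voting ensemble averages class probabilities instead of weighting hard labels. The sketch below is a minimal, untested variant of the same idea; the checkpoint names and weights mirror the commented-out code above and are not what produced the final submission:
def soft_voting(loader, checkpoint_paths, weights):
    # Accumulate weighted softmax probabilities from each fold model, then take the argmax
    summed_probs = None
    for ckpt, w in zip(checkpoint_paths, weights):
        model.load_state_dict(torch.load(ckpt, map_location="cpu"))
        model.to(device)
        model.eval()
        probs = []
        with torch.no_grad():
            for batch in tqdm(loader):
                logits = model(batch["input_ids"].to(device),
                               batch["attention_mask"].to(device)).logits
                probs.append(F.softmax(logits, dim=-1).cpu().numpy())
        probs = np.concatenate(probs, axis=0) * w
        summed_probs = probs if summed_probs is None else summed_probs + probs
    return np.argmax(summed_probs, axis=1)

# Example call (hypothetical checkpoints):
# soft_voting(testloader, [f"/home/dst78/Code_Similar/train_log/voting_model_{i}.pt" for i in range(1, 5)], [0.9, 0.8, 0.85, 0.88])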
df_submission = pd.DataFrame({'similar': predicted_labels})
df_submission.to_csv('result/sub_high_1.csv', index=False)
df_submission_2 = pd.DataFrame({'similar': predicted_labels_2})
df_submission_2.to_csv('result/sub_high_2.csv', index=False)
sub_path = "/home/dst78/Code_Similar/sample_submission.csv"
sub1_path = "/home/dst78/Code_Similar/result/sub_high_1.csv"
sub2_path = "/home/dst78/Code_Similar/result/sub_high_2.csv"
sub_data = pd.read_csv(sub_path)
sub1_data = pd.read_csv(sub1_path)
sub2_data = pd.read_csv(sub2_path)
combined_sub = pd.concat([sub1_data, sub2_data], ignore_index=True)
sub_data["similar"] = combined_sub["similar"]
sub_data
| | pair_id | similar |
|---|---|---|
| 0 | TEST_000000 | 1 |
| 1 | TEST_000001 | 0 |
| 2 | TEST_000002 | 0 |
| 3 | TEST_000003 | 0 |
| 4 | TEST_000004 | 1 |
| ... | ... | ... |
| 594995 | TEST_594995 | 1 |
| 594996 | TEST_594996 | 0 |
| 594997 | TEST_594997 | 0 |
| 594998 | TEST_594998 | 1 |
| 594999 | TEST_594999 | 0 |
595000 rows × 2 columns
sub_data.to_csv("/home/dst78/Code_Similar/result/submission_3.csv", index=False)