[Dacon - Top 18%] Code Similarity Judgment Season 2
0. Import libraries and load the saved data
import numpy as np
import pandas as pd
import os
import random
from tqdm.auto import tqdm
from datasets import load_metric, load_dataset, Dataset, concatenate_datasets
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoConfig, AutoTokenizer, RobertaForSequenceClassification, DataCollatorWithPadding
from transformers import AdamW
from transformers import get_scheduler, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import math
import easydict
import warnings
warnings.filterwarnings("ignore")
# Fix random seeds for reproducibility
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    print(f"Seed set as {seed}")
seed_everything(42)
Seed set as 42
# Metric function reported during evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    # Note: AUROC is computed on the hard predictions here, not on probabilities
    auc = roc_auc_score(labels, preds)
    return {"accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "auroc": auc}
path = "/home/dst78/Code_Similar/result/"
df_train = load_dataset("csv", data_files = path + "train_data_1.csv")["train"]
df_valid = load_dataset("csv", data_files = path + "valid_data_1.csv")["train"]
raw_dataset = concatenate_datasets([df_train, df_valid])
# Load the tokenizer of a code-pretrained transformer (GraphCodeBERT)
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"
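Setting truncation_side = "left" makes the tokenizer drop tokens from the beginning of over-long inputs and keep the end. A minimal illustrative check on a made-up snippet (not from the dataset):
long_code = "int unused;\n" * 400 + "int main() { return 0; }"
ids = tokenizer(long_code, max_length=32, truncation=True)["input_ids"]
print(tokenizer.decode(ids, skip_special_tokens=True))  # the tail (main) survives; the head is truncated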
# Tokenize the C++ source code pairs with the tokenizer
def example_fn(examples):
    outputs = tokenizer(examples["code1"], examples["code2"], padding=True, max_length=512, truncation=True)
    if "similar" in examples:
        outputs["labels"] = examples["similar"]
    return outputs
dataset = raw_dataset.map(example_fn, remove_columns=df_train.column_names)
1. Preprocessing and model training
sample_train_path = "/home/dst78/Code_Similar/sample_train.csv"
from collections import deque
# Preprocessing for the code dataset:
# its main job is stripping // line comments and /* ... */ block comments
def preprocess_script(script):
    new_script = deque()
    in_comment_block = False
    for line in script.split("\n"):  # iterate over lines, not characters
        if in_comment_block:
            if "*/" in line:
                in_comment_block = False
                line = line[line.index("*/") + 2:]
            else:
                continue
        else:
            if "/*" in line:
                if "*/" in line:
                    # block comment opens and closes on the same line
                    line = line[:line.index("/*")] + line[line.index("*/") + 2:]
                else:
                    in_comment_block = True
                    line = line[:line.index("/*")]
        if "//" in line:
            line = line[:line.index("//")]
        line = line.rstrip()
        if line:
            new_script.append(line)
    return "\n".join(new_script)
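A quick illustrative run on a toy snippet (not from the dataset) shows what the function strips:
toy = "#include <iostream>  // header\n/* block\n   comment */\nint main() { return 0; } /* inline */"
print(preprocess_script(toy))
# #include <iostream>
# int main() { return 0; }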
sample_train = pd.read_csv(sample_train_path)
sample_train.head(5)
| | code1_path | code2_path | code1 | code2 | similar |
|---|---|---|---|---|---|
| 0 | ./train_code/problem393/problem393_19.cpp | ./train_code/problem033/problem033_439.cpp | #include <bits/stdc++.h>\n\nusing namespace st... | #include <algorithm>\n#include <bitset>\n#incl... | 0 |
| 1 | ./train_code/problem019/problem019_210.cpp | ./train_code/problem019/problem019_63.cpp | #include <iostream>\n\nusing namespace std;\n\... | #include <iostream>\n#include <string>\nusing ... | 1 |
| 2 | ./train_code/problem107/problem107_486.cpp | ./train_code/problem107/problem107_340.cpp | #include <iostream>\n#include <vector>\nusing ... | #include <cstdio>\n#include <cstdlib>\n#includ... | 1 |
| 3 | ./train_code/problem187/problem187_257.cpp | ./train_code/problem403/problem403_135.cpp | #include <bits/stdc++.h>\n#include <unordered_... | #include <bits/stdc++.h>\nusing namespace std;... | 0 |
| 4 | ./train_code/problem173/problem173_490.cpp | ./train_code/problem173/problem173_345.cpp | #include <bits/stdc++.h>\ntypedef long long ll... | #include "bits/stdc++.h"\n#define rep(i,n) for... | 1 |
preprocess_scripts1 = []
preprocess_scripts2 = []
for code in tqdm(sample_train['code1']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts1.append(preprocessed_script)
for code in tqdm(sample_train['code2']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts2.append(preprocessed_script)
MAX_LEN = 512
tokens1 = []
tokens2 = []
for code1, code2 in tqdm(zip(sample_train['code1'], sample_train['code2'])):
    code1_str, code2_str = str(code1), str(code2)
    tokens1.append(tokenizer.tokenize(code1_str, max_length=MAX_LEN, truncation=True))
    tokens2.append(tokenizer.tokenize(code2_str, max_length=MAX_LEN, truncation=True))
sample_train["code1_token"] = tokens1
sample_train["code2_token"] = tokens2
def drop_column(data):
    data = data.drop(["code1_path", "code2_path", "code1", "code2"], axis=1)
    return data
sample_train = drop_column(sample_train)
sample_train.head(5)
| | similar | code1_token | code2_token |
|---|---|---|---|
| 0 | 0 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [c, ),, end, (, c, ), Ċ, Ċ, template, <, ty, p... |
| 1 | 1 | [#, include, Ġ<, i, ost, ream, >, Ċ, Ċ, using,... | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... |
| 2 | 1 | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... | [Ġ<, set, >, Ċ, #, include, Ġ<, i, ost, ream, ... |
| 3 | 0 | [01, Ċ, #, define, Ġmod, Ġ100, 000000, 7, Ċ, #... | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... |
| 4 | 1 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [#, include, Ġ", bits, /, st, dc, ++, ., h, ",... |
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
sample_train,
sample_train["similar"],
random_state=42,
test_size=0.1,
stratify=sample_train["similar"]
)
X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)
X_train.to_csv("sample_train_result/sample_train_data_1.csv", index=False)
X_valid.to_csv("sample_train_result/sample_valid_data_1.csv", index=False)
sample_path = "/home/dst78/Code_Similar/sample_train_result/"
df_train_sample = load_dataset("csv", data_files = sample_path + "sample_train_data_1.csv")["train"]
df_valid_sample = load_dataset("csv", data_files = sample_path + "sample_valid_data_1.csv")["train"]
sample_raw_dataset = concatenate_datasets([df_train_sample, df_valid_sample])
# The *_token columns were serialized to CSV as plain strings, so they are tokenized again here as text
def example_fn_sample(examples):
    outputs = tokenizer(examples["code1_token"], examples["code2_token"], padding=True, max_length=512, truncation=True)
    if "similar" in examples:
        outputs["labels"] = examples["similar"]
    return outputs
sample_dataset = sample_raw_dataset.map(example_fn_sample, remove_columns=df_train_sample.column_names)
save_path = "/home/dst78/Code_Similar/sample_train_result"
args = easydict.EasyDict({
"output_dir" : save_path,
"logging_dir" : save_path,
"per_device_train_batch_size" : 4,
"per_device_eval_batch_size" : 8,
"gradient_accumulation_steps": 2,
"lr" : 2e-5,
"weight_decay" : 0.0,
"epochs" : 3,
"warmup_steps" : 0,
"logging_steps" : 1000,
"save_steps" : 1000,
"evaluation_strategy" : "steps",
"eval_steps" : 1000,
"k_fold" : 5,
})
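With per_device_train_batch_size = 4 and gradient_accumulation_steps = 2, each optimizer step effectively sees 4 × 2 = 8 examples per GPU. Note that warmup_steps and save_steps set here are recomputed inside train_sample before each fold.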
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
def train_sample(args):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # model.load_state_dict(torch.load(trained_model_path))
    # Training takes a long time; if it was interrupted after a checkpoint was saved,
    # uncomment the line above and point it at the checkpoint path to resume training.
    model.to(device)
    # NOTE: the same model instance is fine-tuned continuously across the folds below.
    gap = int(len(sample_dataset) / args.k_fold)
    for i in range(args.k_fold):
        print("\n%dth Training" % (i+1))
        output_dir = args.output_dir + "_" + str(i+1)
        logging_dir = args.logging_dir + "_" + str(i+1)
        total_size = len(sample_dataset)
        total_ids = list(range(total_size))
        del_ids = list(range(i*gap, (i+1)*gap))
        training_ids = sorted(set(total_ids) - set(del_ids))
        training_dataset = sample_dataset.select(training_ids)
        eval_dataset = sample_dataset.select(del_ids)  # evaluate on the held-out fold
        args.max_steps = args.epochs * len(sample_dataset)
        args.save_steps = len(sample_dataset) // 10
        args.warmup_steps = args.max_steps // 5
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            per_device_train_batch_size=args.per_device_train_batch_size,
            per_device_eval_batch_size=args.per_device_eval_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            num_train_epochs=args.epochs,
            warmup_steps=args.warmup_steps,
            logging_dir=logging_dir,
            logging_steps=args.logging_steps,
            logging_strategy="steps",
            save_steps=args.save_steps,
            save_total_limit=5,
            save_strategy="steps",
            evaluation_strategy=args.evaluation_strategy,
            eval_steps=args.eval_steps,
            load_best_model_at_end=True,
        )
        collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=512)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=training_dataset,
            eval_dataset=eval_dataset,
            data_collator=collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
        )
        print("Training Start")
        trainer.train()
        model_path = "/home/dst78/Code_Similar/train_log/"
        model_save_path = os.path.join(model_path, f"model_{i+1}.pt")
        torch.save(model.state_dict(), model_save_path)
        import gc
        gc.collect()
        torch.cuda.empty_cache()
train_sample(args)
2. Test prediction and submission
test_data = pd.read_csv("/home/dst78/Code_Similar/test.csv")
Because the test set is large, prediction is run on it in two halves.
# Slice off the second half before truncating test_data, otherwise test_data_2 would be empty
test_data_2 = test_data[297500:]
test_data = test_data[:297500]
preprocess_scripts1 = []
preprocess_scripts2 = []
for code in tqdm(test_data['code1']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts1.append(preprocessed_script)
for code in tqdm(test_data['code2']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts2.append(preprocessed_script)
preprocess_scripts1 = []
preprocess_scripts2 = []
for code in tqdm(test_data_2['code1']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts1.append(preprocessed_script)
for code in tqdm(test_data_2['code2']):
    preprocessed_script = preprocess_script(code)
    preprocess_scripts2.append(preprocessed_script)
# Tokenize the first 297,500 test rows
MAX_LEN = 512
tokens1 = []
tokens2 = []
for code1, code2 in tqdm(zip(test_data['code1'], test_data['code2'])):
    code1_str, code2_str = str(code1), str(code2)
    tokens1.append(tokenizer.tokenize(code1_str, max_length=MAX_LEN, truncation=True))
    tokens2.append(tokenizer.tokenize(code2_str, max_length=MAX_LEN, truncation=True))
test_data["code1_token"] = tokens1
test_data["code2_token"] = tokens2
# Tokenize the remaining 297,500 test rows
MAX_LEN = 512
tokens1 = []
tokens2 = []
for code1, code2 in tqdm(zip(test_data_2['code1'], test_data_2['code2'])):
    code1_str, code2_str = str(code1), str(code2)
    tokens1.append(tokenizer.tokenize(code1_str, max_length=MAX_LEN, truncation=True))
    tokens2.append(tokenizer.tokenize(code2_str, max_length=MAX_LEN, truncation=True))
test_data_2["code1_token"] = tokens1
test_data_2["code2_token"] = tokens2
test_data = test_data.drop(["pair_id", "code1", "code2"], axis=1)
test_data
| | code1_token | code2_token |
|---|---|---|
| 0 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... |
| 1 | [#, include, <, bits, /, st, dc, ++, ., h, >, ... | [Ġ<<, Ġend, l, ;, Ċ, //, Ġ, ĉ, }, Ċ, Ġ, Ċ, //,... |
| 2 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [Ċ, #, define, Ġp, ob, Ġpop, _, back, Ċ, #, de... |
| 3 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [N, -, 1, +, M, );, Ċ, Ġ, Ġ, Ġ, Ġrep, (, i, ,,... |
| 4 | [#, include, <, bits, /, st, dc, ++, ., h, >, ... | [N, *, 2, -, 1, ];, Ċ, void, Ġse, g, use, (){,... |
| ... | ... | ... |
| 297495 | [Ġ}, Ċ, Ġ, Ġ, Ġ, Ġ, Ġ, Ġ, Ġ, Ġ}, ĊĊ, Ġ, Ġ, Ġ, ... | [(, A, ,, ĠB, ,, Ġsizeof, (, A, ));, Ċ, #, def... |
| 297496 | [ãĢ, Ģ, ãĢ, Ģ, ãĢ, Ģ, ï, ¼, ı, ãĢ, Ģ, ãĢ, Ģ, ã... | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... |
| 297497 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [#, include, <, c, std, io, >, Ċ, #, include, ... |
| 297498 | [#, include, Ġ<, i, ost, ream, >, Ċ, #, includ... | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... |
| 297499 | [#, include, Ġ<, bits, /, st, dc, ++, ., h, >,... | [ĠP, Ġ=, Ġpair, <, int, ,, int, >, ;, Ċ, Ċ, /*... |
297500 rows × 2 columns
test_data_2 = test_data_2.drop(["pair_id", "code1", "code2"], axis=1)
test_data_2
test_data_1 and test_data_2 are tokenized separately and then combined.
Accordingly, the two tokenized datasets are saved separately and merged afterwards.
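As a quick sanity check (illustrative), the two halves should add back up to the 595,000 test pairs seen in the final submission:
assert len(test_data) + len(test_data_2) == 595000  # 297,500 + 297,500 rows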
test_data.to_csv("result/test_data_1.csv", index=False)
test_data_2.to_csv("result/test_data_2.csv", index=False)
test_path = "/home/dst78/Code_Similar/result/"
df_test = load_dataset("csv", data_files = test_path + "test_data_1.csv")["train"]
df_test_2 = load_dataset("csv", data_files = test_path + "test_data_2.csv")["train"]
'''
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"
trained_model_path = "/home/dst78/Code_Similar/train_log/voting_model_1.pt"
model.load_state_dict(torch.load(trained_model_path))
model.eval()
'''
test_dataset = df_test.map(example_fn_sample, remove_columns=["code1_token", "code2_token"])
test_dataset_2 = df_test_2.map(example_fn_sample, remove_columns=["code1_token", "code2_token"])
collator = DataCollatorWithPadding(tokenizer=tokenizer)
testloader = DataLoader(test_dataset,
batch_size=16,
shuffle=False,
collate_fn = collator
)
testloader_2 = DataLoader(test_dataset_2,
batch_size=16,
shuffle=False,
collate_fn = collator
)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
Model 5 is the best-performing model (checkpoint), so it is used to predict the test set and build the submission.
# Predict the first half of the test set with the best checkpoint (model_5.pt)
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
load_path = "/home/dst78/Code_Similar/train_log/model_5.pt"
model.load_state_dict(torch.load(load_path, map_location=torch.device("cpu")))
model.to(device)
model.eval()
all_logits = []
progress_bar = tqdm(enumerate(testloader), total=len(testloader), leave=True, position=0)
for j, data in progress_bar:
    with torch.no_grad():
        outputs = model(
            data['input_ids'].to(device),
            data['attention_mask'].to(device),
        )
    all_logits.append(outputs.logits.cpu().numpy())
test_logits = np.concatenate(all_logits, axis=0)
predicted_labels = np.argmax(test_logits, axis=1)
# Predict the second half with the same checkpoint
all_logits_2 = []
progress_bar = tqdm(enumerate(testloader_2), total=len(testloader_2), leave=True, position=0)
for j, data in progress_bar:
    with torch.no_grad():
        outputs = model(
            data['input_ids'].to(device),
            data['attention_mask'].to(device),
        )
    all_logits_2.append(outputs.logits.cpu().numpy())
test_logits_2 = np.concatenate(all_logits_2, axis=0)
predicted_labels_2 = np.argmax(test_logits_2, axis=1)
Soft voting procedure (weighted voting across the fold models, kept commented out for reference)
'''
def majority_voting(predictions, weights):
    assert len(predictions) == len(weights)
    result_tensor = torch.zeros(len(predictions), dtype=torch.float)
    for idx, (pred, weight) in enumerate(zip(predictions, weights)):
        result_tensor[idx] = torch.tensor(pred, dtype=torch.float) * weight
    return torch.argmax(result_tensor)

def test_sample(data):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    weights = [0.9, 0.8, 0.85, 0.88]
    all_predictions = []
    for i in tqdm(range(0, 4)):
        model_path = "/home/dst78/Code_Similar/train_log/"
        model_load_path = os.path.join(model_path, f"voting_model_{i+1}.pt")
        model.load_state_dict(torch.load(model_load_path))
        model.to(device)
        model.eval()
        with torch.no_grad():
            predictions = []
            for batch in tqdm(data):
                inputs = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(inputs, attention_mask=attention_mask)
                predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
        all_predictions.append(predictions)
    final_predictions = []
    for preds in zip(*all_predictions):
        final_predictions.append(majority_voting(preds, weights))
    return final_predictions

final_predictions_1 = test_sample(testloader)
'''
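For reference, a true soft-voting ensemble averages class probabilities instead of weighting hard labels. The sketch below is a minimal, untested variant of the same idea; the checkpoint names and weights mirror the commented-out code above and are not what produced the final submission:
def soft_voting(loader, checkpoint_paths, weights):
    # Accumulate weighted softmax probabilities from each fold model, then take the argmax
    summed_probs = None
    for ckpt, w in zip(checkpoint_paths, weights):
        model.load_state_dict(torch.load(ckpt, map_location="cpu"))
        model.to(device)
        model.eval()
        probs = []
        with torch.no_grad():
            for batch in tqdm(loader):
                logits = model(batch["input_ids"].to(device),
                               batch["attention_mask"].to(device)).logits
                probs.append(F.softmax(logits, dim=-1).cpu().numpy())
        probs = np.concatenate(probs, axis=0) * w
        summed_probs = probs if summed_probs is None else summed_probs + probs
    return np.argmax(summed_probs, axis=1)

# Example call (hypothetical checkpoints):
# soft_voting(testloader, [f"/home/dst78/Code_Similar/train_log/voting_model_{i}.pt" for i in range(1, 5)], [0.9, 0.8, 0.85, 0.88])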
df_submission = pd.DataFrame({'similar': predicted_labels})
df_submission.to_csv('result/sub_high_1.csv', index=False)
df_submission_2 = pd.DataFrame({'similar': predicted_labels_2})
df_submission_2.to_csv('result/sub_high_2.csv', index=False)
sub_path = "/home/dst78/Code_Similar/sample_submission.csv"
sub1_path = "/home/dst78/Code_Similar/result/sub_high_1.csv"
sub2_path = "/home/dst78/Code_Similar/result/sub_high_2.csv"
sub_data = pd.read_csv(sub_path)
sub1_data = pd.read_csv(sub1_path)
sub2_data = pd.read_csv(sub2_path)
combined_sub = pd.concat([sub1_data, sub2_data], ignore_index=True)
sub_data["similar"] = combined_sub["similar"]
sub_data
| | pair_id | similar |
|---|---|---|
| 0 | TEST_000000 | 1 |
| 1 | TEST_000001 | 0 |
| 2 | TEST_000002 | 0 |
| 3 | TEST_000003 | 0 |
| 4 | TEST_000004 | 1 |
| ... | ... | ... |
| 594995 | TEST_594995 | 1 |
| 594996 | TEST_594996 | 0 |
| 594997 | TEST_594997 | 0 |
| 594998 | TEST_594998 | 1 |
| 594999 | TEST_594999 | 0 |
595000 rows × 2 columns
sub_data.to_csv("/home/dst78/Code_Similar/result/submission_3.csv", index=False)