# -*- coding: utf-8 -*-
"""Build a code-generation dataset from a GitHub repository, fine-tune a small
causal LM on it, and compare against an instruction-tuned code model."""

# Install the dependencies first (run in a shell or notebook cell):
#   pip install "transformers[torch]" datasets accelerate PyGithub

import re

from datasets import Dataset
from github import Github, Auth

# Authenticate with the GitHub API; replace the placeholder with a real token.
g = Github(auth=Auth.Token("your github token"))
repo = g.get_repo("openai/gym")
def extract_functions_from_code(code):
    """Return the names of all `def`s found in a source string."""
    pattern = re.compile(r"def\s+(\w+)\s*\(.*\):")
    return pattern.findall(code)
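
# Minimal illustration (not in the original script): the regex captures the
# name of each function definition in a source string.
_sample = "def add(a, b):\n    return a + b\n"
assert extract_functions_from_code(_sample) == ["add"]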
# Walk the repository tree and collect every Python file.
python_files = []
contents = repo.get_contents("")
while contents:
    file_content = contents.pop(0)
    if file_content.type == "dir":
        contents.extend(repo.get_contents(file_content.path))
    elif file_content.path.endswith(".py"):
        python_files.append(file_content)
data = {"code": [], "function_name": []}
for file in python_files:
code = file.decoded_content.decode("utf-8")
functions = extract_functions_from_code(code)
for function in functions:
data["code"].append(code)
data["function_name"].append(function)
dataset = Dataset.from_dict(data)
dataset.save_to_disk("code_generation_dataset")
print("Dataset created and saved to disk.")
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import os
import torch

# Fine-tune CodeGen-350M on the dataset built above.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

# CodeGen ships without a pad token; reuse EOS so padded batches work.
tokenizer.pad_token = tokenizer.eos_token

dataset = load_from_disk("code_generation_dataset")
dataset = dataset.train_test_split(test_size=0.1)

os.environ["WANDB_DISABLED"] = "true"  # disable Weights & Biases logging
torch.cuda.empty_cache()

# Trade compute for memory so training fits on a small GPU.
model.gradient_checkpointing_enable()
model.config.use_cache = False  # the KV cache is incompatible with checkpointing
def preprocess_function(examples):
    """Tokenize source files and mirror the inputs as labels
    (the standard causal-LM objective)."""
    tokens = tokenizer(
        examples["code"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_datasets = dataset.map(preprocess_function, batched=True)
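
# Optional peek (not in the original script): map() keeps the source columns
# and adds the input_ids, attention_mask, and labels produced above.
print(tokenized_datasets["train"].column_names)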
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4
    num_train_epochs=1,
    fp16=True,                      # requires a CUDA GPU
    gradient_checkpointing=True,
    save_steps=20000,
    save_total_limit=1,
    report_to="none",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()
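
# Optional (not in the original script): persist the fine-tuned weights so the
# model can be reloaded without retraining; the path is an arbitrary choice.
trainer.save_model("./codegen-350m-finetuned")
tokenizer.save_pretrained("./codegen-350m-finetuned")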
# Compare against an off-the-shelf instruction-tuned code model.
# Install the dependencies first (run in a shell or notebook cell):
#   pip install transformers accelerate sentencepiece

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision to fit in GPU memory
    device_map="auto",          # place layers across available devices
)
def generate_code(prompt, max_new_tokens=500):
    """Generate a completion for `prompt` with greedy decoding."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding; a temperature would be ignored here
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
prompt = "Write python code for merge sort:"
result = generate_code(prompt)
print(result)
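
# Variation (not in the original script): enable sampling for more varied
# completions instead of the greedy decoding above; settings are illustrative.
sampled = model.generate(
    **tokenizer(prompt, return_tensors="pt").to(model.device),
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
)
print(tokenizer.decode(sampled[0], skip_special_tokens=True))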
# The model and generate_code() defined above are reused for a second prompt.
prompt = "Write python code for merge binary trees:"
result = generate_code(prompt)
print(result)