update reproduce
This commit is contained in:
76
reproduce/Step_2.py
Normal file
76
reproduce/Step_2.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import os
|
||||
import json
|
||||
from openai import OpenAI
|
||||
from transformers import GPT2Tokenizer
|
||||
|
||||
def openai_complete_if_cache(
    model="gpt-4o", prompt=None, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Send one chat-completion request to OpenAI and return the reply text.

    Despite the name, this function performs no caching: every call creates
    a new ``OpenAI`` client and issues a fresh request.

    Args:
        model: Name of the chat model to query.
        prompt: Content of the final ``user`` message.
        system_prompt: Optional content for a leading ``system`` message;
            omitted from the request when falsy.
        history_messages: Optional list of prior chat messages (dicts with
            ``role`` and ``content``) inserted between the system message
            and the user prompt. ``None`` means no history.
        **kwargs: Forwarded unchanged to ``chat.completions.create``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # The client is constructed with no arguments, so its configuration
    # (API key, base URL, ...) comes from the library's own defaults.
    openai_client = OpenAI()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # A ``None`` default replaces the former shared mutable ``[]`` default.
    if history_messages:
        messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    response = openai_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )
    return response.choices[0].message.content
|
||||
|
||||
# Module-level GPT-2 tokenizer, created once when the script starts;
# get_summary() uses it to split contexts into tokens and join them back.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
)
|
||||
|
||||
def get_summary(context, tot_tokens=2000, skip_tokens=1000, tok=None):
    """Build an extractive summary from the start and the end of ``context``.

    The text is tokenized, a margin of up to ``skip_tokens`` tokens is
    skipped at each end, and two windows of ``tot_tokens // 2`` tokens are
    taken: one right after the leading margin and one right before the
    trailing margin. The two windows are concatenated and converted back
    to a string.

    Args:
        context: The text to summarize.
        tot_tokens: Token budget of the summary (split evenly between the
            two windows).
        skip_tokens: Number of tokens to skip at the very start and very
            end of the text before taking each window.
        tok: Optional tokenizer object providing ``tokenize(str)`` and
            ``convert_tokens_to_string(list)``. Defaults to the
            module-level ``tokenizer``.

    Returns:
        The summary string. When the whole text fits in ``tot_tokens``
        tokens it is returned in full, in its original order.
    """
    tok = tokenizer if tok is None else tok
    tokens = tok.tokenize(context)
    total = len(tokens)
    budget = max(tot_tokens, 0)

    if total <= budget:
        # Nothing to cut: keep short texts whole instead of dropping or
        # reordering parts of them.
        selected = tokens
    else:
        half_tokens = budget // 2
        # Shrink the margin on medium-length texts so that the two windows
        # always fit inside the text without overlapping each other.
        margin = min(max(skip_tokens, 0), (total - budget) // 2)

        start_tokens = tokens[margin:margin + half_tokens]

        # The trailing window is measured from the END of the text. Explicit
        # indices are used (rather than negative slice bounds) so that a
        # zero margin does not collapse the slice to an empty list.
        end_stop = total - margin
        end_tokens = tokens[end_stop - half_tokens:end_stop]

        selected = start_tokens + end_tokens

    return tok.convert_tokens_to_string(selected)
|
||||
|
||||
|
||||
# Dataset classes to process. Each entry needs a matching
# ../datasets/unique_contexts/<cls>_unique_contexts.json input file.
clses = ['agriculture']

for cls in clses:
    # Read the input explicitly as UTF-8 so the result does not depend on
    # the platform's default encoding.
    with open(
        f'../datasets/unique_contexts/{cls}_unique_contexts.json',
        mode='r',
        encoding='utf-8',
    ) as f:
        unique_contexts = json.load(f)

    # One summary per context, joined into a single dataset description.
    summaries = [get_summary(context) for context in unique_contexts]

    total_description = "\n\n".join(summaries)

    # The prompt is sent to the model verbatim, so its lines are kept
    # flush-left rather than indented with the surrounding loop body.
    prompt = f"""
Given the following description of a dataset:

{total_description}

Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset.

Output the results in the following structure:
- User 1: [user description]
- Task 1: [task description]
- Question 1:
- Question 2:
- Question 3:
- Question 4:
- Question 5:
- Task 2: [task description]
...
- Task 5: [task description]
- User 2: [user description]
...
- User 5: [user description]
...
"""

    result = openai_complete_if_cache(model='gpt-4o', prompt=prompt)

    file_path = f"../datasets/questions/{cls}_questions.txt"
    # Create the output directory on first run instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Model output may contain non-ASCII characters; write it as UTF-8.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(result)

    print(f"{cls}_questions written to {file_path}")
|
Reference in New Issue
Block a user