From c51b2b38b675d4502e60e1714c6e50f3a8b014fb Mon Sep 17 00:00:00 2001
From: LarFii <834462287@qq.com>
Date: Fri, 11 Oct 2024 15:16:43 +0800
Subject: [PATCH] update reproduce

---
 README.md           | 114 +++++++++++++++++++++++++++++++++++++++++++-
 reproduce/Step_0.py |  63 ++++++++++++++++++++++++
 reproduce/Step_1.py |  32 +++++++++++++
 reproduce/Step_2.py |  76 +++++++++++++++++++++++++++++
 reproduce/Step_3.py |  62 ++++++++++++++++++++++++
 5 files changed, 346 insertions(+), 1 deletion(-)
 create mode 100644 reproduce/Step_0.py
 create mode 100644 reproduce/Step_1.py
 create mode 100644 reproduce/Step_2.py
 create mode 100644 reproduce/Step_3.py

diff --git a/README.md b/README.md
index 693f60cf..f70b9d58 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,6 @@ Output your evaluation in the following JSON format:
 }}
 ```
 ### Overall Performance Table
-### Overall Performance Table
 |                      | **Agriculture** |              | **CS**   |              | **Legal** |              | **Mix**    |              |
 |----------------------|-----------------|--------------|----------|--------------|-----------|--------------|------------|--------------|
 |                      | NaiveRAG        | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG  | **LightRAG** | NaiveRAG   | **LightRAG** |
@@ -173,6 +172,114 @@ Output your evaluation in the following JSON format:
 | **Empowerment**      | 36.69%          | **63.31%**   | 45.09%   | **54.91%**   | 42.81%    | **57.19%**   | **52.94%** | 47.06%       |
 | **Overall**          | 43.62%          | **56.38%**   | 45.98%   | **54.02%**   | 45.70%    | **54.30%**   | **51.86%** | 48.14%       |
+
+## Reproduce
+All the code can be found in the `./reproduce` directory.
+
+### Step-0 Extract Unique Contexts
+First, we extract the unique contexts from the datasets.
+
+```python
+def extract_unique_contexts(input_directory, output_directory):
+
+    os.makedirs(output_directory, exist_ok=True)
+
+    jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl'))
+    print(f"Found {len(jsonl_files)} JSONL files.")
+
+    for file_path in jsonl_files:
+        filename = os.path.basename(file_path)
+        name, ext = os.path.splitext(filename)
+        output_filename = f"{name}_unique_contexts.json"
+        output_path = os.path.join(output_directory, output_filename)
+
+        # A dict keyed by context doubles as an insertion-ordered set.
+        unique_contexts_dict = {}
+
+        print(f"Processing file: {filename}")
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as infile:
+                for line_number, line in enumerate(infile, start=1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        json_obj = json.loads(line)
+                        context = json_obj.get('context')
+                        if context and context not in unique_contexts_dict:
+                            unique_contexts_dict[context] = None
+                    except json.JSONDecodeError as e:
+                        print(f"JSON decoding error in file {filename} at line {line_number}: {e}")
+        except FileNotFoundError:
+            print(f"File not found: {filename}")
+            continue
+        except Exception as e:
+            print(f"An error occurred while processing file {filename}: {e}")
+            continue
+
+        unique_contexts_list = list(unique_contexts_dict.keys())
+        print(f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.")
+
+        try:
+            with open(output_path, 'w', encoding='utf-8') as outfile:
+                json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
+            print(f"Unique `context` entries have been saved to: {output_filename}")
+        except Exception as e:
+            print(f"An error occurred while saving to the file {output_filename}: {e}")
+
+    print("All files have been processed.")
+```
+
+### Step-1 Insert Contexts
+Next, we insert the extracted contexts into the LightRAG system, retrying on transient failures.
+
+```python
+def insert_text(rag, file_path):
+    with open(file_path, mode='r') as f:
+        unique_contexts = json.load(f)
+
+    retries = 0
+    max_retries = 3
+    while retries < max_retries:
+        try:
+            rag.insert(unique_contexts)
+            break
+        except Exception as e:
+            retries += 1
+            print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}")
+            time.sleep(10)
+    if retries == max_retries:
+        print("Insertion failed after exceeding the maximum number of retries")
+```
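+
+A minimal driver for this step, mirroring `reproduce/Step_1.py` (the dataset class name and the paths are placeholders for your local layout):
+
+```python
+import os
+from lightrag import LightRAG
+
+cls = "agriculture"
+WORKING_DIR = f"../{cls}"
+
+if not os.path.exists(WORKING_DIR):
+    os.mkdir(WORKING_DIR)
+
+rag = LightRAG(working_dir=WORKING_DIR)
+insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json")
+```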
+
+### Step-2 Generate Queries
+
+For each context, we take a slice of tokens from its first half and another from its second half, stitch them together as a summary, and concatenate the summaries of all contexts into a single dataset description from which the evaluation queries are generated.
+
+```python
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+
+def get_summary(context, tot_tokens=2000):
+    tokens = tokenizer.tokenize(context)
+    half_tokens = tot_tokens // 2
+
+    # Skip the first and last 1,000 tokens and take half of the
+    # token budget from each side of the context.
+    start_tokens = tokens[1000:1000 + half_tokens]
+    end_tokens = tokens[-(1000 + half_tokens):-1000]
+
+    summary_tokens = start_tokens + end_tokens
+    summary = tokenizer.convert_tokens_to_string(summary_tokens)
+
+    return summary
+```
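+
+The per-context summaries are then joined into the dataset description that is fed to the query-generation prompt, as in `reproduce/Step_2.py`:
+
+```python
+import json
+
+with open('../datasets/unique_contexts/agriculture_unique_contexts.json') as f:
+    unique_contexts = json.load(f)
+
+summaries = [get_summary(context) for context in unique_contexts]
+total_description = "\n\n".join(summaries)
+```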
+
+### Step-3 Query
+Finally, we extract the queries generated in Step-2 and use them to query LightRAG.
+
+```python
+def extract_queries(file_path):
+    with open(file_path, 'r') as f:
+        data = f.read()
+
+    # Strip markdown bold markers before matching the question lines.
+    data = data.replace('**', '')
+
+    queries = re.findall(r'- Question \d+: (.+)', data)
+
+    return queries
+```
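+
+A sketch of how the extracted queries can be answered (the full script, `reproduce/Step_3.py`, runs the same flow asynchronously and logs results and errors to JSON files; the mode and paths here are illustrative):
+
+```python
+from lightrag import LightRAG, QueryParam
+
+rag = LightRAG(working_dir="../agriculture")
+query_param = QueryParam(mode="hybrid")
+
+for query in extract_queries("../datasets/questions/agriculture_questions.txt"):
+    answer = rag.query(query, param=query_param)
+    print(answer)
+```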
 
 ## Code Structure
 
 ```python
@@ -191,6 +298,11 @@
 │   ├── prompt.py
 │   ├── storage.py
 │   └── utils.py
+├── reproduce
+│   ├── Step_0.py
+│   ├── Step_1.py
+│   ├── Step_2.py
+│   └── Step_3.py
 ├── LICENSE
 ├── README.md
 ├── requirements.txt
diff --git a/reproduce/Step_0.py b/reproduce/Step_0.py
new file mode 100644
index 00000000..9053aa40
--- /dev/null
+++ b/reproduce/Step_0.py
@@ -0,0 +1,63 @@
+import os
+import json
+import glob
+import argparse
+
+def extract_unique_contexts(input_directory, output_directory):
+
+    os.makedirs(output_directory, exist_ok=True)
+
+    jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl'))
+    print(f"Found {len(jsonl_files)} JSONL files.")
+
+    for file_path in jsonl_files:
+        filename = os.path.basename(file_path)
+        name, ext = os.path.splitext(filename)
+        output_filename = f"{name}_unique_contexts.json"
+        output_path = os.path.join(output_directory, output_filename)
+
+        # A dict keyed by context doubles as an insertion-ordered set.
+        unique_contexts_dict = {}
+
+        print(f"Processing file: {filename}")
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as infile:
+                for line_number, line in enumerate(infile, start=1):
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        json_obj = json.loads(line)
+                        context = json_obj.get('context')
+                        if context and context not in unique_contexts_dict:
+                            unique_contexts_dict[context] = None
+                    except json.JSONDecodeError as e:
+                        print(f"JSON decoding error in file {filename} at line {line_number}: {e}")
+        except FileNotFoundError:
+            print(f"File not found: {filename}")
+            continue
+        except Exception as e:
+            print(f"An error occurred while processing file {filename}: {e}")
+            continue
+
+        unique_contexts_list = list(unique_contexts_dict.keys())
+        print(f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.")
+
+        try:
+            with open(output_path, 'w', encoding='utf-8') as outfile:
+                json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
+            print(f"Unique `context` entries have been saved to: {output_filename}")
+        except Exception as e:
+            print(f"An error occurred while saving to the file {output_filename}: {e}")
+
+    print("All files have been processed.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--input_dir', type=str, default='../datasets')
+    parser.add_argument('-o', '--output_dir', type=str, default='../datasets/unique_contexts')
+
+    args = parser.parse_args()
+
+    extract_unique_contexts(args.input_dir, args.output_dir)
diff --git a/reproduce/Step_1.py b/reproduce/Step_1.py
new file mode 100644
index 00000000..08e497cb
--- /dev/null
+++ b/reproduce/Step_1.py
@@ -0,0 +1,32 @@
+import os
+import json
+import time
+
+from lightrag import LightRAG
+
+def insert_text(rag, file_path):
+    with open(file_path, mode='r') as f:
+        unique_contexts = json.load(f)
+
+    retries = 0
+    max_retries = 3
+    while retries < max_retries:
+        try:
+            rag.insert(unique_contexts)
+            break
+        except Exception as e:
+            retries += 1
+            print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}")
+            time.sleep(10)
+    if retries == max_retries:
+        print("Insertion failed after exceeding the maximum number of retries")
+
+cls = "agriculture"
+WORKING_DIR = f"../{cls}"
+
+if not os.path.exists(WORKING_DIR):
+    os.mkdir(WORKING_DIR)
+
+rag = LightRAG(working_dir=WORKING_DIR)
+
+insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json")
\ No newline at end of file
diff --git a/reproduce/Step_2.py b/reproduce/Step_2.py
new file mode 100644
index 00000000..b00c19b8
--- /dev/null
+++ b/reproduce/Step_2.py
@@ -0,0 +1,76 @@
+import os
+import json
+from openai import OpenAI
+from transformers import GPT2Tokenizer
+
+def openai_complete_if_cache(
+    model="gpt-4o", prompt=None, system_prompt=None, history_messages=[], **kwargs
+) -> str:
+    openai_client = OpenAI()
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.extend(history_messages)
+    messages.append({"role": "user", "content": prompt})
+
+    response = openai_client.chat.completions.create(
+        model=model, messages=messages, **kwargs
+    )
+    return response.choices[0].message.content
+
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+
+def get_summary(context, tot_tokens=2000):
+    tokens = tokenizer.tokenize(context)
+    half_tokens = tot_tokens // 2
+
+    # Skip the first and last 1,000 tokens and take half of the
+    # token budget from each side of the context.
+    start_tokens = tokens[1000:1000 + half_tokens]
+    end_tokens = tokens[-(1000 + half_tokens):-1000]
+
+    summary_tokens = start_tokens + end_tokens
+    summary = tokenizer.convert_tokens_to_string(summary_tokens)
+
+    return summary
+
+
+clses = ['agriculture']
+for cls in clses:
+    with open(f'../datasets/unique_contexts/{cls}_unique_contexts.json', mode='r') as f:
+        unique_contexts = json.load(f)
+
+    summaries = [get_summary(context) for context in unique_contexts]
+
+    total_description = "\n\n".join(summaries)
+
+    prompt = f"""
+    Given the following description of a dataset:
+
+    {total_description}
+
+    Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset.
+
+    Output the results in the following structure:
+    - User 1: [user description]
+        - Task 1: [task description]
+            - Question 1:
+            - Question 2:
+            - Question 3:
+            - Question 4:
+            - Question 5:
+        - Task 2: [task description]
+            ...
+        - Task 5: [task description]
+    - User 2: [user description]
+        ...
+    - User 5: [user description]
+        ...
+    """
+
+    result = openai_complete_if_cache(model='gpt-4o', prompt=prompt)
+
+    # Make sure the questions directory exists before writing.
+    file_path = f"../datasets/questions/{cls}_questions.txt"
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w") as file:
+        file.write(result)
+
+    print(f"{cls}_questions written to {file_path}")
\ No newline at end of file
diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py
new file mode 100644
index 00000000..f7f7ee30
--- /dev/null
+++ b/reproduce/Step_3.py
@@ -0,0 +1,62 @@
+import re
+import json
+import asyncio
+from lightrag import LightRAG, QueryParam
+from tqdm import tqdm
+
+def extract_queries(file_path):
+    with open(file_path, 'r') as f:
+        data = f.read()
+
+    data = data.replace('**', '')
+
+    queries = re.findall(r'- Question \d+: (.+)', data)
+
+    return queries
+
+async def process_query(query_text, rag_instance, query_param):
+    try:
+        result, context = await rag_instance.aquery(query_text, param=query_param)
+        return {"query": query_text, "result": result, "context": context}, None
+    except Exception as e:
+        return None, {"query": query_text, "error": str(e)}
+
+def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
+    # Reuse the current event loop if one exists; otherwise create a new one.
+    try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+    return loop
+
+def run_queries_and_save_to_json(queries, rag_instance, query_param, output_file, error_file):
+    loop = always_get_an_event_loop()
+
+    with open(output_file, 'a', encoding='utf-8') as result_file, open(error_file, 'a', encoding='utf-8') as err_file:
+        result_file.write("[\n")
+        first_entry = True
+
+        for query_text in tqdm(queries, desc="Processing queries", unit="query"):
+            result, error = loop.run_until_complete(process_query(query_text, rag_instance, query_param))
+
+            if result:
+                if not first_entry:
+                    result_file.write(",\n")
+                json.dump(result, result_file, ensure_ascii=False, indent=4)
+                first_entry = False
+            elif error:
+                json.dump(error, err_file, ensure_ascii=False, indent=4)
+                err_file.write("\n")
+
+        result_file.write("\n]")
+
+if __name__ == "__main__":
+    cls = "agriculture"
+    mode = "hybrid"
+    WORKING_DIR = f"../{cls}"
+
+    rag = LightRAG(working_dir=WORKING_DIR)
+    query_param = QueryParam(mode=mode)
+
+    queries = extract_queries(f"../datasets/questions/{cls}_questions.txt")
+    run_queries_and_save_to_json(queries, rag, query_param, "result.json", "errors.json")
\ No newline at end of file