diff --git a/examples/test_split_by_character.ipynb b/examples/test_split_by_character.ipynb new file mode 100644 index 00000000..e8e08b92 --- /dev/null +++ b/examples/test_split_by_character.ipynb @@ -0,0 +1,1296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4b5690db12e34685", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:40:58.307102Z", + "start_time": "2025-01-09T03:40:51.935233Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import logging\n", + "import numpy as np\n", + "from lightrag import LightRAG, QueryParam\n", + "from lightrag.llm import openai_complete_if_cache, openai_embedding\n", + "from lightrag.utils import EmbeddingFunc\n", + "import nest_asyncio" + ] + }, + { + "cell_type": "markdown", + "id": "dd17956ec322b361", + "metadata": {}, + "source": "#### split by character" + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8c8ee7c061bf9159", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:13.961167Z", + "start_time": "2025-01-09T03:41:13.958357Z" + } + }, + "outputs": [], + "source": [ + "nest_asyncio.apply()\n", + "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test1\"\n", + "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n", + "if not os.path.exists(WORKING_DIR):\n", + " os.mkdir(WORKING_DIR)\n", + "API = os.environ.get(\"DOUBAO_API_KEY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a5009d16e0851dca", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:16.862036Z", + "start_time": "2025-01-09T03:41:16.859306Z" + } + }, + "outputs": [], + "source": [ + "async def llm_model_func(\n", + " prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs\n", + ") -> str:\n", + " return await openai_complete_if_cache(\n", + " \"ep-20241218114828-2tlww\",\n", + " prompt,\n", + " system_prompt=system_prompt,\n", + " history_messages=history_messages,\n", + " api_key=API,\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " **kwargs,\n", + " )\n", + "\n", + "\n", + "async def embedding_func(texts: list[str]) -> np.ndarray:\n", + " return await openai_embedding(\n", + " texts,\n", + " model=\"ep-20241231173413-pgjmk\",\n", + " api_key=API,\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "397fcad24ce4d0ed", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:24.950307Z", + "start_time": "2025-01-09T03:41:24.940353Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test1\n", + "INFO:lightrag:Load KV llm_response_cache with 0 data\n", + "INFO:lightrag:Load KV full_docs with 0 data\n", + "INFO:lightrag:Load KV text_chunks with 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_entities.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_relationships.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_chunks.json'} 0 data\n", + "INFO:lightrag:Loaded document status storage with 0 records\n" + ] + } + ], + "source": [ + "rag = LightRAG(\n", + " working_dir=WORKING_DIR,\n", + " 
llm_model_func=llm_model_func,\n", + " embedding_func=EmbeddingFunc(\n", + " embedding_dim=4096, max_token_size=8192, func=embedding_func\n", + " ),\n", + " chunk_token_size=512,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1dc3603677f7484d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:37.947456Z", + "start_time": "2025-01-09T03:41:37.941901Z" + } + }, + "outputs": [], + "source": [ + "with open(\n", + " \"../../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n", + ") as f:\n", + " content = f.read()\n", + "\n", + "\n", + "async def embedding_func(texts: list[str]) -> np.ndarray:\n", + " return await openai_embedding(\n", + " texts,\n", + " model=\"ep-20241231173413-pgjmk\",\n", + " api_key=API,\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " )\n", + "\n", + "\n", + "async def get_embedding_dim():\n", + " test_text = [\"This is a test sentence.\"]\n", + " embedding = await embedding_func(test_text)\n", + " embedding_dim = embedding.shape[1]\n", + " return embedding_dim" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6844202606acfbe5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:39.608541Z", + "start_time": "2025-01-09T03:41:39.165057Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "embedding_dimension = await get_embedding_dim()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d6273839d9681403", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:44:34.295345Z", + "start_time": "2025-01-09T03:41:48.324171Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Processing 1 new unique documents\n", + "Processing batch 1: 0%| | 0/1 [00:00标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7a6491385b050095", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:45:40.829111Z", + "start_time": "2025-01-09T03:45:13.530298Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:Local query uses 5 entites, 12 relations, 3 text units\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:Global query uses 8 entites, 5 relations, 4 text units\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<分析>\n", + "1. **该文献主要研究的问题是什么?**\n", + " - 思考过程:通过浏览论文内容,查找作者明确阐述研究目的的部分。文中多处提及“Our study was performed to explore whether folic acid treatment was associated with cancer outcomes and all-cause mortality after extended follow-up”,表明作者旨在探究叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后。\n", + "2. 
**该文献采用什么方法进行分析?**\n", + " - 思考过程:寻找描述研究方法和数据分析过程的段落。文中提到“Survival curves were constructed using the Kaplan-Meier method and differences in survival between groups were analyzed using the log-rank test. Estimates of hazard ratios (HRs) with 95% CIs were obtained by using Cox proportional hazards regression models stratified by trial”,可以看出作者使用了Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异以及Cox比例风险回归模型估计风险比等方法。\n", + "3. **该文献的主要结论是什么?**\n", + " - 思考过程:定位到论文中总结结论的部分,如“Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”,可知作者得出叶酸加维生素$\\mathsf{B}_{12}$治疗与癌症结局和全因死亡率增加有关的结论。\n", + "<回答>\n", + "1. 该文献主要研究的问题是:叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后,叶酸治疗是否与癌症结局和全因死亡率相关。\n", + "2. 该文献采用的分析方法包括:使用Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异、Cox比例风险回归模型估计风险比等。\n", + "3. 该文献的主要结论是:在挪威没有叶酸强化食品的情况下,叶酸加维生素$\\mathsf{B}_{12}$治疗与缺血性心脏病患者的癌症结局和全因死亡率增加有关。\n", + "\n", + "**参考文献**\n", + "- [VD] In2Norwegianhomocysteine-lowering trialsamongpatientswithischemicheart disease, there was a statistically nonsignificantincreaseincancerincidenceinthe groupsassignedtofolicacidtreatment.15,16 Our study was performed to explore whetherfolicacidtreatmentwasassociatedwithcanceroutcomesandall-cause mortality after extended follow-up.\n", + "- [VD] Survivalcurveswereconstructedusing theKaplan-Meiermethodanddifferences insurvivalbetweengroupswereanalyzed usingthelog-ranktest.Estimatesofhazard ratios (HRs) with $95\\%$ CIs were obtainedbyusingCoxproportionalhazards regressionmodelsstratifiedbytrial.\n", + "- [VD] Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods.\n" + ] + } + ], + "source": [ + "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n", + "print(resp)" + ] + }, + { + "cell_type": "markdown", + "id": "4e5bfad24cb721a8", + "metadata": {}, + "source": "#### split by character only" + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "44e2992dc95f8ce0", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:47:40.988796Z", + "start_time": "2025-01-09T03:47:40.982648Z" + } + }, + "outputs": [], + "source": [ + "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test2\"\n", + "if not os.path.exists(WORKING_DIR):\n", + " os.mkdir(WORKING_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "62c63385d2d973d5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:51:39.951329Z", + "start_time": "2025-01-09T03:49:15.218976Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test2\n", + "INFO:lightrag:Load KV llm_response_cache with 0 data\n", + "INFO:lightrag:Load KV full_docs with 0 data\n", + "INFO:lightrag:Load KV text_chunks with 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': 
'../../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n", + "INFO:lightrag:Loaded document status storage with 0 records\n", + "INFO:lightrag:Processing 1 new unique documents\n", + "Processing batch 1: 0%| | 0/1 [00:00\n", + "- **该文献主要研究的问题是什么?**\n", + " - **思考过程**:通过浏览论文的标题、摘要、引言等部分,寻找关于研究目的和问题的描述。论文标题为“Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12”,摘要中的“Objective”部分明确指出研究目的是“To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials”。因此,可以确定该文献主要研究的问题是评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n", + "- **该文献采用什么方法进行分析?**\n", + " - **思考过程**:在论文的“METHODS”部分详细描述了研究方法。文中提到这是一个对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行了观察性的试验后随访。具体包括对参与者进行分组干预(不同剂量的叶酸、维生素B12、维生素B6或安慰剂),收集临床信息和血样,分析循环B族维生素、同型半胱氨酸和可替宁等指标,并进行基因分型等,还涉及到多种统计分析方法,如计算预期癌症发生率、构建生存曲线、进行Cox比例风险回归模型分析等。\n", + "- **该文献的主要结论是什么?**\n", + " - **思考过程**:在论文的“Results”和“Conclusion”部分寻找主要结论。研究结果表明,在治疗期间,接受叶酸加维生素B12治疗的参与者血清叶酸浓度显著增加,且在后续随访中,该组癌症发病率、癌症死亡率和全因死亡率均有所上升,主要是肺癌发病率增加,而维生素B6治疗未显示出显著影响。结论部分明确指出“Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”。\n", + "\n", + "\n", + "<回答>\n", + "- **主要研究问题**:评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n", + "- **研究方法**:采用对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行观察性的试验后随访,涉及分组干预、多种指标检测以及多种统计分析方法。\n", + "- **主要结论**:在挪威(食品中未添加叶酸),对于缺血性心脏病患者,叶酸加维生素B12治疗与癌症结局和全因死亡率的增加有关,而维生素B6治疗未显示出显著影响。\n", + "\n", + "**参考文献**\n", + "- [VD] Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12\n", + "- [VD] METHODS Study Design, Participants, and Study Intervention\n", + "- [VD] RESULTS\n", + "- [VD] Conclusion\n", + "- [VD] Objective To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials.\n" + ] + } + ], + "source": [ + "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n", + "print(resp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba6fa79a2550d10", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7496d736..b94ff821 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -314,18 +314,25 @@ class LightRAG: "JsonDocStatusStorage": JsonDocStatusStorage, } - def insert(self, string_or_strings, split_by_character=None): + def insert( + self, string_or_strings, split_by_character=None, split_by_character_only=False + ): loop = always_get_an_event_loop() return loop.run_until_complete( - self.ainsert(string_or_strings, split_by_character) + self.ainsert(string_or_strings, split_by_character, split_by_character_only) ) - async def ainsert(self, string_or_strings, split_by_character): + async def ainsert( + self, string_or_strings, split_by_character, split_by_character_only + ): """Insert 
documents with checkpoint support Args: string_or_strings: Single document string or list of document strings - split_by_character: if split_by_character is not None, split the string by character + split_by_character: if split_by_character is not None, split the string by character, if chunk longer than + chunk_size, split the sub chunk by token size. + split_by_character_only: if split_by_character_only is True, split the string by character only, when + split_by_character is None, this parameter is ignored. """ if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] @@ -384,6 +391,7 @@ class LightRAG: for dp in chunking_by_token_size( doc["content"], split_by_character=split_by_character, + split_by_character_only=split_by_character_only, overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, tiktoken_model=self.tiktoken_model_name, diff --git a/lightrag/operate.py b/lightrag/operate.py index 1128b41c..58ae3703 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -36,6 +36,7 @@ import time def chunking_by_token_size( content: str, split_by_character=None, + split_by_character_only=False, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o", @@ -45,21 +46,26 @@ def chunking_by_token_size( if split_by_character: raw_chunks = content.split(split_by_character) new_chunks = [] - for chunk in raw_chunks: - _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) - if len(_tokens) > max_token_size: - for start in range( - 0, len(_tokens), max_token_size - overlap_token_size - ): - chunk_content = decode_tokens_by_tiktoken( - _tokens[start : start + max_token_size], - model_name=tiktoken_model, - ) - new_chunks.append( - (min(max_token_size, len(_tokens) - start), chunk_content) - ) - else: + if split_by_character_only: + for chunk in raw_chunks: + _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) new_chunks.append((len(_tokens), chunk)) + else: + for chunk in raw_chunks: + _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) + if len(_tokens) > max_token_size: + for start in range( + 0, len(_tokens), max_token_size - overlap_token_size + ): + chunk_content = decode_tokens_by_tiktoken( + _tokens[start : start + max_token_size], + model_name=tiktoken_model, + ) + new_chunks.append( + (min(max_token_size, len(_tokens) - start), chunk_content) + ) + else: + new_chunks.append((len(_tokens), chunk)) for index, (_len, chunk) in enumerate(new_chunks): results.append( { diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 2b9253b4..00000000 --- a/test.ipynb +++ /dev/null @@ -1,740 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "4b5690db12e34685", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:34.174205Z", - "start_time": "2025-01-07T05:38:29.978194Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "import logging\n", - "import numpy as np\n", - "from lightrag import LightRAG, QueryParam\n", - "from lightrag.llm import openai_complete_if_cache, openai_embedding\n", - "from lightrag.utils import EmbeddingFunc\n", - "import nest_asyncio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8c8ee7c061bf9159", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:37.440083Z", - "start_time": "2025-01-07T05:38:37.437666Z" - } - }, - "outputs": [], - "source": [ - "nest_asyncio.apply()\n", - "WORKING_DIR = \"../llm_rag/paper_db/R000088_test2\"\n", - 
"logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n", - "if not os.path.exists(WORKING_DIR):\n", - " os.mkdir(WORKING_DIR)\n", - "os.environ[\"doubao_api\"] = \"6b890250-0cf6-4eb1-aa82-9c9d711398a7\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a5009d16e0851dca", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:42.594315Z", - "start_time": "2025-01-07T05:38:42.590800Z" - } - }, - "outputs": [], - "source": [ - "async def llm_model_func(\n", - " prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs\n", - ") -> str:\n", - " return await openai_complete_if_cache(\n", - " \"ep-20241218114828-2tlww\",\n", - " prompt,\n", - " system_prompt=system_prompt,\n", - " history_messages=history_messages,\n", - " api_key=os.getenv(\"doubao_api\"),\n", - " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", - " **kwargs,\n", - " )\n", - "\n", - "\n", - "async def embedding_func(texts: list[str]) -> np.ndarray:\n", - " return await openai_embedding(\n", - " texts,\n", - " model=\"ep-20241231173413-pgjmk\",\n", - " api_key=os.getenv(\"doubao_api\"),\n", - " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "397fcad24ce4d0ed", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:44.016901Z", - "start_time": "2025-01-07T05:38:44.006291Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:lightrag:Logger initialized for working directory: ../llm_rag/paper_db/R000088_test2\n", - "INFO:lightrag:Load KV llm_response_cache with 0 data\n", - "INFO:lightrag:Load KV full_docs with 0 data\n", - "INFO:lightrag:Load KV text_chunks with 0 data\n", - "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n", - "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n", - "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n", - "INFO:lightrag:Loaded document status storage with 0 records\n" - ] - } - ], - "source": [ - "rag = LightRAG(\n", - " working_dir=WORKING_DIR,\n", - " llm_model_func=llm_model_func,\n", - " embedding_func=EmbeddingFunc(\n", - " embedding_dim=4096, max_token_size=8192, func=embedding_func\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1dc3603677f7484d", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:47.509111Z", - "start_time": "2025-01-07T05:38:47.501997Z" - } - }, - "outputs": [], - "source": [ - "with open(\n", - " \"../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n", - ") as f:\n", - " content = f.read()\n", - "\n", - "\n", - "async def embedding_func(texts: list[str]) -> np.ndarray:\n", - " return await openai_embedding(\n", - " texts,\n", - " model=\"ep-20241231173413-pgjmk\",\n", - " api_key=os.getenv(\"doubao_api\"),\n", - " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", - " )\n", - "\n", - "\n", - "async def get_embedding_dim():\n", - " test_text = [\"This is a test sentence.\"]\n", - " embedding = await embedding_func(test_text)\n", - " embedding_dim = embedding.shape[1]\n", - " return embedding_dim" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - 
"id": "6844202606acfbe5", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:50.666764Z", - "start_time": "2025-01-07T05:38:50.247712Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n" - ] - } - ], - "source": [ - "embedding_dimension = await get_embedding_dim()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d6273839d9681403", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:42:33.085507Z", - "start_time": "2025-01-07T05:38:56.789348Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:lightrag:Processing 1 new unique documents\n", - "Processing batch 1: 0%| | 0/1 [00:00标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "7a6491385b050095", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:43:24.751628Z", - "start_time": "2025-01-07T05:42:50.865679Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n", - "INFO:lightrag:kw_prompt result:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"high_level_keywords\": [\"英文学术研究论文分析\", \"关键信息提取\", \"深入分析\"],\n", - " \"low_level_keywords\": [\"研究队列\", \"队列名称\", \"队列开展国家\", \"性别分布\", \"年龄分布\", \"队列研究时间线\", \"实际参与研究人数\"]\n", - "}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:lightrag:Local query uses 60 entites, 38 relations, 6 text units\n", - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:lightrag:Global query uses 72 entites, 60 relations, 4 text units\n", - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "<分析>\n", - "- **分析对象来自哪些研究队列及是单独分析还是联合分析**:\n", - " 通过查找论文内容,发现文中提到“This is a combined analysis of data from 2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16)”,明确是对两个队列的数据进行联合分析,队列名称分别为“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”。\n", - "- **队列开展的国家**:\n", - " 文中多次提及研究在挪威进行,如“combined analyses and extended follow-up of 2 vitamin B intervention trials among patients with ischemic heart disease in Norway”,所以确定研究开展的国家是挪威。\n", - "- **队列研究对象的性别分布**:\n", - " 从“Mean (SD) age was 62.3 (11.0) years and 23.5% of participants were women”可知,研究对象包含男性和女性,即全体。\n", - "- **队列收集结束时研究对象年龄分布**:\n", - " 已知“Mean (SD) age was 62.3 (11.0) years”是基线时年龄信息,“Median (interquartile range) duration of extended follow-up through December 31, 2007, was 78 (61 - 90) months”,由于随访的中位时间是78个月(约6.5年),所以可推算队列收集结束时研究对象年龄均值约为62.3 + 6.5 = 68.8岁(标准差仍为11.0年)。\n", - "- **队列研究时间线**:\n", - " 根据“2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16) conducted between 1998 and 2005, and an observational posttrial follow-up through December 31, 2007”可知,队列开始收集信息时间为1998年,结束时间为2007年12月31日。\n", - "- 
**队列结束时实际参与研究人数**:\n", - " 由“A total of 6837 individuals were included in the combined analyses, of whom 6261 (91.6%) participated in posttrial follow-up”可知,队列结束时实际参与研究人数为6261人。\n", - "\n", - "\n", - "<回答>\n", - "- 分析对象来自“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”两个研究队列,文中是对这两个队列的数据进行联合分析。\n", - "- 队列开展的国家是挪威。\n", - "- 队列研究对象的性别分布为全体。\n", - "- 队列收集结束时,研究对象年龄分布均值约为68.8岁,标准差为11.0年。\n", - "- 队列研究时间线为1998年开始收集信息/建立队列,2007年12月31日结束。\n", - "- 队列结束时实际参与研究人数是6261人。\n" - ] - } - ], - "source": [ - "print(rag.query(prompt1, param=QueryParam(mode=\"hybrid\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fef9d06983da47af", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}
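
Usage note (a sketch, not part of the patch): the calls below mirror the two notebook runs above. The separator "\n#" is illustrative (any string accepted by str.split works), and `rag` / `content` are the objects built in the notebook cells; only the keyword names come from the new `insert` signature in lightrag/lightrag.py.

    # Run 1 ("split by character"): split on the separator, then re-chunk any
    # piece longer than chunk_token_size into overlapping token windows.
    rag.insert(content, split_by_character="\n#")

    # Run 2 ("split by character only"): split on the separator and keep each
    # piece whole, even when it exceeds chunk_token_size.
    rag.insert(content, split_by_character="\n#", split_by_character_only=True)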
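
For reviewers, a minimal self-contained sketch of the branching added to chunking_by_token_size() in lightrag/operate.py. It swaps encode_string_by_tiktoken / decode_tokens_by_tiktoken for a whitespace tokenizer so it runs without tiktoken, and the no-separator fallback is an assumption (that branch is outside this hunk); only the split-by-character logic is taken from the diff.

    # Stand-alone sketch of the two chunking modes; not the library code itself.
    def sketch_chunking(
        content,
        split_by_character=None,
        split_by_character_only=False,
        overlap_token_size=2,
        max_token_size=8,
    ):
        tokenize = str.split      # stand-in for encode_string_by_tiktoken
        detokenize = " ".join     # stand-in for decode_tokens_by_tiktoken
        pieces = content.split(split_by_character) if split_by_character else [content]
        chunks = []
        for piece in pieces:
            tokens = tokenize(piece)
            if split_by_character and split_by_character_only:
                # keep every character-delimited piece whole, whatever its length
                chunks.append((len(tokens), piece))
            elif len(tokens) > max_token_size:
                # re-split oversized pieces into overlapping token windows
                step = max_token_size - overlap_token_size
                for start in range(0, len(tokens), step):
                    window = tokens[start:start + max_token_size]
                    chunks.append((len(window), detokenize(window)))
            else:
                chunks.append((len(tokens), piece))
        return chunks

    text = "alpha beta gamma delta epsilon zeta eta theta iota kappa\nshort tail"
    print(sketch_chunking(text, split_by_character="\n"))
    # -> [(8, 'alpha ... theta'), (4, 'eta theta iota kappa'), (2, 'short tail')]
    print(sketch_chunking(text, split_by_character="\n", split_by_character_only=True))
    # -> [(10, 'alpha ... kappa'), (2, 'short tail')]

The first call re-splits the oversized 10-token piece into overlapping token windows, while the second keeps every character-delimited piece whole regardless of length, which is the behaviour the new docstring in ainsert describes.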