fix: handle null bytes (0x00) in text processing
- Fix PostgreSQL encoding error by properly handling null bytes (0x00) in text processing.
- The clean_text function now removes null bytes from all input text during the indexing phase.
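For context, the failure this commit addresses: '\x00' is a legal character in a Python str and in UTF-8, but PostgreSQL text columns cannot store it, so drivers reject such values (psycopg2, for example, raises "ValueError: A string literal cannot contain NUL (0x00) characters"). A minimal sketch of the problem and the cleanup, no database required:

    # Text extracted from PDFs or other binary sources can carry null bytes.
    text = "pdf-extracted\x00content"
    assert "\x00" in text
    text.encode("utf-8")  # encodes fine at the Python level...
    # ...but must be cleaned before it reaches a PostgreSQL text column:
    cleaned = text.strip().replace("\x00", "")
    assert cleaned == "pdf-extractedcontent"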
@@ -474,6 +474,11 @@ class LightRAG:
         storage_class = lazy_external_import(import_path, storage_name)
         return storage_class
 
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Clean text by removing null bytes (0x00) and whitespace"""
+        return text.strip().replace('\x00', '')
+
     def insert(
         self,
         input: str | list[str],
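Behavior of the new helper in isolation (a standalone sketch mirroring the method above). Note the ordering: strip() runs before the replace, so whitespace shielded by a null byte survives cleaning:

    def clean_text(text: str) -> str:
        """Clean text by removing null bytes (0x00) and whitespace"""
        return text.strip().replace('\x00', '')

    assert clean_text(" a\x00b ") == "ab"
    # strip() cannot see past the null byte, so the inner space is kept:
    assert clean_text(" \x00 x") == " x"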
@@ -521,8 +526,13 @@ class LightRAG:
     ) -> None:
         update_storage = False
         try:
-            doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
-            new_docs = {doc_key: {"content": full_text.strip()}}
+            # Clean input texts
+            full_text = self.clean_text(full_text)
+            text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
+
+            # Process cleaned texts
+            doc_key = compute_mdhash_id(full_text, prefix="doc-")
+            new_docs = {doc_key: {"content": full_text}}
 
             _add_doc_keys = await self.full_docs.filter_keys(set(doc_key))
             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
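Cleaning before hashing matters because the document key is content-derived: two inputs that differ only by null bytes or outer whitespace should resolve to the same doc- id. A sketch, assuming compute_mdhash_id is a prefixed MD5 hex digest as the name suggests (its actual definition lives in LightRAG's utils):

    from hashlib import md5

    def compute_mdhash_id(content: str, prefix: str = "") -> str:
        # assumption: prefixed MD5 of the content
        return prefix + md5(content.encode()).hexdigest()

    raw = "report\x00 body"
    cleaned = raw.strip().replace('\x00', '')  # what clean_text produces
    assert compute_mdhash_id(cleaned, prefix="doc-") == \
        compute_mdhash_id("report body", prefix="doc-")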
@@ -535,11 +545,10 @@ class LightRAG:
 
             inserting_chunks: dict[str, Any] = {}
             for chunk_text in text_chunks:
-                chunk_text_stripped = chunk_text.strip()
-                chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
+                chunk_key = compute_mdhash_id(chunk_text, prefix="chunk-")
 
                 inserting_chunks[chunk_key] = {
-                    "content": chunk_text_stripped,
+                    "content": chunk_text,
                     "full_doc_id": doc_key,
                 }
 
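Design note: since text_chunks is already cleaned element-by-element in the previous hunk, the per-chunk chunk_text_stripped temporary is redundant; this hunk removes it and keys and stores each chunk by the already-cleaned chunk_text.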
@@ -576,8 +585,8 @@ class LightRAG:
         if isinstance(input, str):
             input = [input]
 
-        # 1. Remove duplicate contents from the list
-        unique_contents = list(set(doc.strip() for doc in input))
+        # Clean input text and remove duplicates
+        unique_contents = list(set(self.clean_text(doc) for doc in input))
 
         # 2. Generate document IDs and initial status
         new_docs: dict[str, Any] = {
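The dedup step now keys on cleaned text, so inputs that differ only in surrounding whitespace or embedded null bytes collapse to a single entry. A minimal sketch of the set-based dedup above:

    docs = ["doc one", " doc one ", "doc\x00 one", "doc two"]
    unique_contents = list(set(d.strip().replace('\x00', '') for d in docs))
    assert sorted(unique_contents) == ["doc one", "doc two"]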
@@ -779,7 +788,7 @@ class LightRAG:
         all_chunks_data: dict[str, dict[str, str]] = {}
         chunk_to_source_map: dict[str, str] = {}
         for chunk_data in custom_kg.get("chunks", {}):
-            chunk_content = chunk_data["content"].strip()
+            chunk_content = self.clean_text(chunk_data["content"])
            source_id = chunk_data["source_id"]
             tokens = len(
                 encode_string_by_tiktoken(
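The custom-KG path gets the same treatment: chunk content is cleaned before token counting and storage. Sketch of the token-count call, assuming encode_string_by_tiktoken wraps a tiktoken encoder (a stand-in; the real helper is defined in LightRAG's utils):

    import tiktoken

    _enc = tiktoken.get_encoding("cl100k_base")  # assumed encoding

    def encode_string_by_tiktoken(content: str) -> list[int]:
        return _enc.encode(content)

    chunk_content = "graph \x00node".strip().replace('\x00', '')
    tokens = len(encode_string_by_tiktoken(chunk_content))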