fix: handle null bytes (0x00) in text processing
- Fix PostgreSQL encoding error by properly handling null bytes (0x00) in text processing.
- The clean_text function now removes null bytes from all input text during the indexing phase.
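For context: PostgreSQL does not allow the NUL character (0x00) in text values, so a document carrying stray null bytes (common in text extracted from PDFs or binary-contaminated files) fails at insert time. A minimal standalone sketch of the cleaning step, mirroring the clean_text method added in this diff (the sample input is illustrative):

def clean_text(text: str) -> str:
    # Strip surrounding whitespace, then drop NUL bytes,
    # which PostgreSQL text fields reject.
    return text.strip().replace("\x00", "")

raw = "  Hello\x00 world  "  # e.g. text pulled from a malformed PDF
cleaned = clean_text(raw)
assert "\x00" not in cleaned
print(repr(cleaned))  # 'Hello world'

Note that strip() runs before the NUL replacement, so whitespace shielded by a trailing null byte (e.g. "text \x00") survives cleaning; this matches the committed behavior.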
@@ -474,6 +474,11 @@ class LightRAG:
         storage_class = lazy_external_import(import_path, storage_name)
         return storage_class
 
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Clean text by removing null bytes (0x00) and whitespace"""
+        return text.strip().replace('\x00', '')
+
     def insert(
         self,
         input: str | list[str],
@@ -521,8 +526,13 @@ class LightRAG:
     ) -> None:
         update_storage = False
         try:
-            doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
-            new_docs = {doc_key: {"content": full_text.strip()}}
+            # Clean input texts
+            full_text = self.clean_text(full_text)
+            text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
+
+            # Process cleaned texts
+            doc_key = compute_mdhash_id(full_text, prefix="doc-")
+            new_docs = {doc_key: {"content": full_text}}
 
             _add_doc_keys = await self.full_docs.filter_keys(set(doc_key))
             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
@@ -535,11 +545,10 @@ class LightRAG:
 
             inserting_chunks: dict[str, Any] = {}
             for chunk_text in text_chunks:
-                chunk_text_stripped = chunk_text.strip()
-                chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
+                chunk_key = compute_mdhash_id(chunk_text, prefix="chunk-")
 
                 inserting_chunks[chunk_key] = {
-                    "content": chunk_text_stripped,
+                    "content": chunk_text,
                     "full_doc_id": doc_key,
                 }
 
@@ -576,8 +585,8 @@ class LightRAG:
         if isinstance(input, str):
             input = [input]
 
-        # 1. Remove duplicate contents from the list
-        unique_contents = list(set(doc.strip() for doc in input))
+        # Clean input text and remove duplicates
+        unique_contents = list(set(self.clean_text(doc) for doc in input))
 
         # 2. Generate document IDs and initial status
         new_docs: dict[str, Any] = {
@@ -779,7 +788,7 @@ class LightRAG:
         all_chunks_data: dict[str, dict[str, str]] = {}
         chunk_to_source_map: dict[str, str] = {}
         for chunk_data in custom_kg.get("chunks", {}):
-            chunk_content = chunk_data["content"].strip()
+            chunk_content = self.clean_text(chunk_data["content"])
             source_id = chunk_data["source_id"]
             tokens = len(
                 encode_string_by_tiktoken(
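Because document and chunk ids are content hashes, cleaning before compute_mdhash_id also keeps deduplication consistent: a document that differs from an already-indexed copy only by a stray NUL byte now maps to the same id. A sketch of that effect, assuming compute_mdhash_id is a prefixed md5 of the content (a hypothetical stand-in; the real helper comes from lightrag's utilities and is not shown in this diff):

from hashlib import md5

def mdhash_id(content: str, prefix: str = "") -> str:
    # Assumed shape of compute_mdhash_id: prefix + md5 hex digest.
    return prefix + md5(content.encode()).hexdigest()

def clean_text(text: str) -> str:
    return text.strip().replace("\x00", "")

doc = "same document"
noisy = "same document\x00"

# Raw input: the NUL byte changes the hash, so the same document
# would get two different doc-ids and be indexed twice.
assert mdhash_id(doc, "doc-") != mdhash_id(noisy, "doc-")

# Cleaned input: identical ids, so filter_keys() can deduplicate
# before anything reaches the PostgreSQL-backed storages.
assert mdhash_id(clean_text(doc), "doc-") == mdhash_id(clean_text(noisy), "doc-")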