added docs and fields

Author: Yannick Stephan
Date:   2025-02-20 13:09:33 +01:00
Parent: 4b478d1c0f
Commit: 32d0f1acb0
2 changed files with 36 additions and 12 deletions


@@ -54,9 +54,7 @@ class PostgreSQLDB:
         self.pool: Pool | None = None
 
         if self.user is None or self.password is None or self.database is None:
-            raise ValueError(
-                "Missing database user, password, or database"
-            )
+            raise ValueError("Missing database user, password, or database")
 
     async def initdb(self):
         try:


@@ -225,6 +225,7 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
         asyncio.set_event_loop(new_loop)
         return new_loop
 
+
 @final
 @dataclass
 class LightRAG:
@@ -271,7 +272,9 @@ class LightRAG:
     chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
     """Maximum number of tokens per text chunk when splitting documents."""
 
-    chunk_overlap_token_size: int = field(default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100)))
+    chunk_overlap_token_size: int = field(
+        default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100))
+    )
     """Number of overlapping tokens between consecutive text chunks to preserve context."""
 
     tiktoken_model_name: str = field(default="gpt-4o-mini")
@@ -281,7 +284,9 @@ class LightRAG:
     entity_extract_max_gleaning: int = field(default=1)
     """Maximum number of entity extraction attempts for ambiguous content."""
 
-    entity_summary_to_max_tokens: int = field(default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)))
+    entity_summary_to_max_tokens: int = field(
+        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
+    )
     """Maximum number of tokens used for summarizing extracted entities."""
 
     # Node embedding
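The fields touched by the two hunks above pull their defaults from environment variables (CHUNK_SIZE, CHUNK_OVERLAP_SIZE, MAX_TOKEN_SUMMARY), evaluated when the module defining LightRAG is imported. The sketch below shows the two ways these values can be overridden; it only covers the fields from this diff and assumes the rest of the LightRAG setup (working directory, LLM and embedding functions, storages) is configured elsewhere.

# Sketch: overriding the chunking/summary defaults shown above. Only the
# fields from this diff are set; a real setup also passes the LLM/embedding
# functions and other required configuration.
import os

# Option 1: environment variables. These are read by the field defaults when
# the lightrag module is imported, so they must be set before the import.
os.environ["CHUNK_SIZE"] = "800"           # -> chunk_token_size
os.environ["CHUNK_OVERLAP_SIZE"] = "80"    # -> chunk_overlap_token_size
os.environ["MAX_TOKEN_SUMMARY"] = "400"    # -> entity_summary_to_max_tokens

from lightrag import LightRAG

# Option 2: explicit constructor arguments, which override the defaults for
# this instance regardless of the environment.
rag = LightRAG(
    chunk_token_size=800,
    chunk_overlap_token_size=80,
    entity_summary_to_max_tokens=400,
    tiktoken_model_name="gpt-4o-mini",
)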
@@ -355,12 +360,15 @@ class LightRAG:
     auto_manage_storages_states: bool = field(default=True)
     """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times."""
 
-    """Dictionary for additional parameters and extensions."""
-    convert_response_to_json_func: Callable[[str], dict[str, Any]] = (
-        convert_response_to_json
-    )
+    convert_response_to_json_func: Callable[[str], dict[str, Any]] = field(
+        default_factory=lambda: convert_response_to_json
+    )
+    """
+    Custom function for converting LLM responses to JSON format.
+
+    The default function is :func:`.utils.convert_response_to_json`.
+    """
 
-    # Custom Chunking Function
     chunking_func: Callable[
         [
             str,
@@ -371,7 +379,25 @@ class LightRAG:
             str,
         ],
         list[dict[str, Any]],
-    ] = chunking_by_token_size
+    ] = field(default_factory=lambda: chunking_by_token_size)
+    """
+    Custom chunking function for splitting text into chunks before processing.
+
+    The function should take the following parameters:
+
+        - `content`: The text to be split into chunks.
+        - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
+        - `split_by_character_only`: If True, the text is split only on the specified character.
+        - `chunk_token_size`: The maximum number of tokens per chunk.
+        - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+        - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
+
+    The function should return a list of dictionaries, where each dictionary contains the following keys:
+        - `tokens`: The number of tokens in the chunk.
+        - `content`: The text content of the chunk.
+
+    Defaults to `chunking_by_token_size` if not specified.
+    """
 
     def verify_storage_implementation(
         self, storage_type: str, storage_name: str
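The two docstrings added above spell out the contracts for the customization hooks. Below is a minimal sketch of both: a chunking function matching the documented parameter order and return keys, and a response-to-JSON converter matching Callable[[str], dict[str, Any]]. The helper names (blank_line_chunks, lenient_json) and the blank-line splitting policy are illustrative assumptions, not part of LightRAG.

# Sketch of custom hooks matching the documented contracts; the helpers here
# are illustrative only and ignore some tuning parameters for brevity.
import json
from typing import Any

import tiktoken
from lightrag import LightRAG


def blank_line_chunks(
    content: str,
    split_by_character: str | None,
    split_by_character_only: bool,
    chunk_token_size: int,
    chunk_overlap_token_size: int,
    tiktoken_model_name: str,
) -> list[dict[str, Any]]:
    """Naive chunker: split on blank lines, ignoring overlap and size limits."""
    try:
        encoder = tiktoken.encoding_for_model(tiktoken_model_name)
    except KeyError:
        encoder = tiktoken.get_encoding("cl100k_base")  # fallback for unknown models
    chunks: list[dict[str, Any]] = []
    for piece in content.split("\n\n"):
        piece = piece.strip()
        if piece:
            # Return exactly the documented keys: token count and chunk text.
            chunks.append({"tokens": len(encoder.encode(piece)), "content": piece})
    return chunks


def lenient_json(response: str) -> dict[str, Any]:
    """Converter that falls back to an empty dict on malformed LLM output."""
    try:
        data = json.loads(response)
    except json.JSONDecodeError:
        return {}
    return data if isinstance(data, dict) else {}


rag = LightRAG(  # other required configuration omitted, as in the sketch above
    chunking_func=blank_line_chunks,
    convert_response_to_json_func=lenient_json,
)

Since both hooks are plain callables on the dataclass, they can be swapped per instance without subclassing LightRAG.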