Bases: BaseConverter
Converts DocX content to a list of EntryVersionBlocks using MarkItDown, an AI
Markdown-formatting pass and the MarkdownConverter.
For more info on MarkItDown:
https://github.com/microsoft/markitdown
Source code in django_spire/knowledge/entry/version/converters/converter.py
def __init__(self, entry_version: EntryVersion):
    """Bind the converter to the entry version passed in."""
    # Kept on the instance so the conversion methods can read it later.
    self.entry_version = entry_version
|
convert_file_to_blocks
Source code in django_spire/knowledge/entry/version/converters/docx_converter.py
def convert_file_to_blocks(self, file: File) -> list[models.EntryVersionBlock]:
    """Convert the document behind ``file`` into entry version blocks.

    The document at ``file.file.path`` is converted to Markdown with
    MarkItDown, the Markdown is run through ``improve_markdown_structure``,
    and the result is handed to
    ``MarkdownConverter.convert_markdown_to_blocks``.
    """
    conversion = MarkItDown().convert(file.file.path)
    raw_markdown = conversion.markdown

    # The block converter is created before the AI pass runs, as in the
    # original statement order.
    block_converter = MarkdownConverter(entry_version=self.entry_version)
    structured_markdown = self.improve_markdown_structure(raw_markdown)

    return block_converter.convert_markdown_to_blocks(
        markdown_content=structured_markdown
    )
|
improve_markdown_structure
staticmethod
Source code in django_spire/knowledge/entry/version/converters/docx_converter.py
@staticmethod
def improve_markdown_structure(markdown_content: str) -> str:
    """Reformat Markdown by passing fixed-size chunks through the LLM bot.

    The content is cut into slices of ``MARKDOWN_AI_CHUNK_SIZE`` characters,
    each slice is submitted to ``MarkdownFormatLlmBot.process`` on a pool of
    up to four worker threads, and the results are joined back together in
    the original chunk order. Chunk boundaries are plain character offsets,
    so a slice may end in the middle of a line.

    Args:
        markdown_content: Markdown text to improve.

    Returns:
        The concatenation of the processed chunks. A chunk whose processing
        raised an exception is replaced by its original, unprocessed text.
        Empty input yields an empty string.
    """
    import logging

    chunks = [
        markdown_content[start: start + MARKDOWN_AI_CHUNK_SIZE]
        for start in range(0, len(markdown_content), MARKDOWN_AI_CHUNK_SIZE)
    ]

    # NOTE(review): one bot instance is shared by all worker threads --
    # confirm that process() is safe to call concurrently.
    markdown_format_bot = MarkdownFormatLlmBot()

    # Start from the original chunks so that a failed chunk simply keeps its
    # source text; successful results overwrite their slot by index.
    improved_chunks = list(chunks)

    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future to its chunk position instead of attaching an
        # ad-hoc attribute to the Future object.
        index_by_future = {
            executor.submit(markdown_format_bot.process, markdown_content=chunk): index
            for index, chunk in enumerate(chunks)
        }

        for future in as_completed(index_by_future):
            index = index_by_future[future]
            try:
                # as_completed only yields finished futures, so result()
                # returns (or raises) immediately; a timeout here would
                # never take effect.
                improved_chunks[index] = future.result()
            except Exception:
                # Best-effort: keep the unprocessed chunk, but leave a trace
                # instead of failing silently.
                logging.getLogger(__name__).warning(
                    'Markdown formatting failed for chunk %d; using original text.',
                    index,
                    exc_info=True,
                )

    return ''.join(improved_chunks)
|