Skip to content

docx_converter

django_spire.knowledge.entry.version.converters.docx_converter

DocxConverter

Bases: BaseConverter

Converts DOCX content to a list of EntryVersionBlock objects using MarkItDown, AI, and the MarkdownConverter.

For more info on MarkItDown: https://github.com/microsoft/markitdown

Source code in django_spire/knowledge/entry/version/converters/converter.py
def __init__(self, entry_version: EntryVersion):
    """Keep a reference to ``entry_version`` on the instance as ``self.entry_version``."""
    self.entry_version = entry_version

convert_file_to_blocks

Source code in django_spire/knowledge/entry/version/converters/docx_converter.py
def convert_file_to_blocks(self, file: File) -> list[models.EntryVersionBlock]:
    """
    Build entry version blocks from the document stored at ``file.file.path``.

    The document is first converted to Markdown with MarkItDown. That
    Markdown is then passed through ``improve_markdown_structure`` and the
    result is handed to ``MarkdownConverter.convert_markdown_to_blocks``,
    whose return value is returned unchanged.
    """
    raw_markdown = MarkItDown().convert(file.file.path).markdown

    # The block converter is bound to the same entry version as this converter.
    block_converter = MarkdownConverter(entry_version=self.entry_version)
    structured_markdown = self.improve_markdown_structure(raw_markdown)

    return block_converter.convert_markdown_to_blocks(
        markdown_content=structured_markdown
    )

improve_markdown_structure staticmethod

Source code in django_spire/knowledge/entry/version/converters/docx_converter.py
@staticmethod
def improve_markdown_structure(markdown_content: str) -> str:
    """
    Reformat Markdown by running it through ``MarkdownFormatLlmBot`` in chunks.

    The content is cut into consecutive slices of ``MARKDOWN_AI_CHUNK_SIZE``
    characters. Each slice is submitted to ``MarkdownFormatLlmBot.process``
    on a pool of up to four worker threads, and the results are concatenated
    in the original chunk order.

    Processing is best-effort: if a chunk's call raises, the failure is
    logged and the original, unprocessed chunk is kept so no content is lost.

    Args:
        markdown_content: The Markdown text to restructure.

    Returns:
        The concatenation of the processed (or, on failure, original) chunks.
        An empty input yields an empty string without creating a bot or pool.
    """
    import logging

    if not markdown_content:
        return ''

    markdown_content_chunks = [
        markdown_content[start: start + MARKDOWN_AI_CHUNK_SIZE]
        for start in range(0, len(markdown_content), MARKDOWN_AI_CHUNK_SIZE)
    ]

    markdown_format_bot = MarkdownFormatLlmBot()
    improved_chunks = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(markdown_format_bot.process, markdown_content=chunk)
            for chunk in markdown_content_chunks
        ]

        # Collect in submission order so the output needs no re-sorting.
        # No per-result timeout is used: the executor's context manager
        # waits for every worker on exit regardless, so a timeout here
        # would not shorten the overall wait.
        for index, (original_chunk, future) in enumerate(
            zip(markdown_content_chunks, futures)
        ):
            try:
                improved_chunk = future.result()
            except Exception:
                logging.getLogger(__name__).exception(
                    'Markdown formatting failed for chunk %d of %d; '
                    'keeping the original text.',
                    index + 1,
                    len(markdown_content_chunks),
                )
                improved_chunk = original_chunk

            improved_chunks.append(improved_chunk)

    return ''.join(improved_chunks)