Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: document extractor parsing order for docx files #14406

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 52 additions & 46 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@
import os
import tempfile
from collections.abc import Mapping, Sequence
from typing import Any, cast
from typing import Any, cast, Union, Iterator

import docx
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx.table import Table
from docx import Document
from docx.document import Document as _Document
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from docx.oxml.ns import qn

from configs import dify_config
from core.file import File, FileTransferMethod, file_manager
Expand Down Expand Up @@ -238,60 +241,42 @@ def _extract_text_from_docx(file_content: bytes) -> str:
"""
try:
doc_file = io.BytesIO(file_content)
doc = docx.Document(doc_file)
doc = Document(doc_file)
text = []

# Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = []

# Process paragraphs and tables
for i, paragraph in enumerate(doc.paragraphs):
if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))
for block in _iter_block_items(doc):
if isinstance(block, Paragraph):
if block.text.strip():
text.append(block.text)
elif isinstance(block, Table):
has_content = any(
cell.text.strip()
for row in block.rows
for cell in row.cells
)

if not has_content:
continue

for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))
try:
header_cells = block.rows[0].cells
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
markdown_table = f"| {' | '.join(header_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"

# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))
for row in block.rows[1:]:
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += f"| {' | '.join(row_texts)} |\n"

# Process sorted content
for _, item_type, item in content_items:
if item_type == "paragraph":
if isinstance(item, Table):
continue
text.append(item.text)
elif item_type == "table":
# Process tables
if not isinstance(item, Table):
continue
try:
# Check if any cell in the table has text
has_content = False
for row in item.rows:
if any(cell.text.strip() for cell in row.cells):
has_content = True
break

if has_content:
cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
markdown_table = f"| {' | '.join(cell_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"

for row in item.rows[1:]:
# Replace newlines with <br> in each cell
row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += "| " + " | ".join(row_cells) + " |\n"

text.append(markdown_table)
text.append(markdown_table)
except Exception as e:
logger.warning(f"Failed to extract table from DOC: {e}")
logger.warning(f"Failed to extract table from DOCX: {e}")
continue

return "\n".join(text)

except Exception as e:
raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e
logger.error(f"Failed to extract text from DOCX: {e}")
return ""


def _download_file_content(file: File) -> bytes:
Expand Down Expand Up @@ -440,3 +425,24 @@ def _extract_text_from_msg(file_content: bytes) -> str:
return "\n".join([str(element) for element in elements])
except Exception as e:
raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e


def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragraph, Table]]:
"""
Yield each paragraph and table child within *parent*, in document order.
Each returned value is an instance of either Paragraph or Table.
"""
if isinstance(parent, _Document):
parent_elm = parent.element.body
elif isinstance(parent, Table):
parent_elm = parent._element
elif isinstance(parent, _Cell):
parent_elm = parent._tc
else:
raise ValueError("Unsupported parent type")

for child in parent_elm.iterchildren():
if child.tag == qn("w:p"):
yield Paragraph(child, parent)
elif child.tag == qn("w:tbl"):
yield Table(child, parent)
Loading