diff --git a/lib/processors/zim_processor.py b/lib/processors/zim_processor.py
index b258408..6f5c887 100644
--- a/lib/processors/zim_processor.py
+++ b/lib/processors/zim_processor.py
@@ -77,10 +77,73 @@ def _text_hash(text):
return hashlib.md5(text.encode('utf-8')).hexdigest()
+def _flatten_table(table_el):
+ """Convert a
element to pipe-delimited text.
+
+ Each becomes a row with cells joined by ' | '.
+ Returns the formatted table as a string with blank lines around it.
+ """
+ rows = []
+ for tr in table_el.iter('tr'):
+ cells = []
+ for cell in tr:
+ if cell.tag in ('td', 'th'):
+ cell_text = (cell.text_content() or '').strip()
+ # Collapse internal whitespace in each cell
+ cell_text = re.sub(r'\s+', ' ', cell_text)
+ if cell_text:
+ cells.append(cell_text)
+ if cells:
+ rows.append(' | '.join(cells))
+ if not rows:
+ return ''
+ return '\n'.join(rows)
+
+
+def _preprocess_tree(doc):
+ """Pre-process HTML tree to add delimiters before text_content() flattens it.
+
+ Handles: ,
, , , -- elements that lxml's
+ text_content() would concatenate without any separators.
+ """
+ from lxml import etree
+
+ # 1. Replace elements with their pipe-delimited text
+ for table in list(doc.iter('table')):
+ formatted = _flatten_table(table)
+ if formatted:
+ replacement = etree.Element('div')
+ replacement.text = '\n\n' + formatted + '\n\n'
+ parent = table.getparent()
+ if parent is not None:
+ parent.replace(table, replacement)
+ else:
+ table.drop_tree()
+
+ # 2.
-> inject newline
+ for br in list(doc.iter('br')):
+ br.tail = '\n' + (br.tail or '')
+
+ # 3. -> inject newline + "- " prefix
+ for li in list(doc.iter('li')):
+ li.text = '- ' + (li.text or '')
+ li.tail = '\n' + (li.tail or '')
+
+ # 4. -> inject newline before
+ for dt in list(doc.iter('dt')):
+ dt.tail = '\n' + (dt.tail or '')
+
+ # 5. -> inject newline + indent
+ for dd in list(doc.iter('dd')):
+ dd.text = ' ' + (dd.text or '')
+ dd.tail = '\n' + (dd.tail or '')
+
+
def _html_to_text(html_bytes):
"""Convert HTML bytes to clean text via lxml.
Strips nav, footer, script, style elements. Decodes entities.
+ Pre-processes tables, lists, and line breaks for proper delimiters.
Normalizes whitespace.
"""
try:
@@ -93,6 +156,9 @@ def _html_to_text(html_bytes):
for el in doc.iter(tag):
el.drop_tree()
+ # Pre-process tree: tables -> pipe-delimited, br -> newlines, li -> dashes
+ _preprocess_tree(doc)
+
# Extract text
text = doc.text_content()