Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ export const createDocumentJson = (docx, converter, editor) => {

// Safety: drop any inline-only nodes that accidentally landed at the doc root
parsedContent = filterOutRootInlineNodes(parsedContent);
parsedContent = normalizeTableBookmarksInContent(parsedContent, editor);
collapseWhitespaceNextToInlinePassthrough(parsedContent);

const result = {
Expand Down Expand Up @@ -844,6 +845,210 @@ export function filterOutRootInlineNodes(content = []) {
return result;
}

/**
* Normalize bookmark nodes that appear as direct table children.
* Moves bookmarkStart/End into the first/last cell textblock of the table.
*
* Some non-conformant DOCX producers place bookmarks as direct table children.
* Per ECMA-376 §17.13.6.2, they should be inside cells (bookmarkStart) or
* as children of rows (bookmarkEnd).
* PM can't accept bookmarks as a direct child of table row and that is why
* we relocate them for compatibility.
*
* @param {Array<{type: string, content?: any[], attrs?: any}>} content
* @param {Editor} [editor]
* @returns {Array}
*/
export function normalizeTableBookmarksInContent(content = [], editor) {
if (!Array.isArray(content) || content.length === 0) return content;

return content.map((node) => normalizeTableBookmarksInNode(node, editor));
}

function normalizeTableBookmarksInNode(node, editor) {
if (!node || typeof node !== 'object') return node;

if (node.type === 'table') {
node = normalizeTableBookmarksInTable(node, editor);
}

if (Array.isArray(node.content)) {
node.content = normalizeTableBookmarksInContent(node.content, editor);
}

return node;
}

function parseColIndex(val) {
if (val == null || val === '') return null;
const n = parseInt(String(val), 10);
return Number.isNaN(n) ? null : Math.max(0, n);
}

/** colFirst/colLast apply only to bookmarkStart; bookmarkEnd always uses first/last cell by position. */
function getCellIndexForBookmark(bookmarkNode, position, rowCellCount) {
if (!rowCellCount) return 0;
if (bookmarkNode?.type === 'bookmarkEnd') {
return position === 'start' ? 0 : rowCellCount - 1;
}
const attrs = bookmarkNode?.attrs ?? {};
const col = parseColIndex(position === 'start' ? attrs.colFirst : attrs.colLast);
if (col == null) return position === 'start' ? 0 : rowCellCount - 1;
return Math.min(col, rowCellCount - 1);
}

function addBookmarkToRowCellInlines(rowCellInlines, rowIndex, position, bookmarkNode, rowCellCount) {
const cellIndex = getCellIndexForBookmark(bookmarkNode, position, rowCellCount);
const bucket = rowCellInlines[rowIndex][position];
if (!bucket[cellIndex]) bucket[cellIndex] = [];
bucket[cellIndex].push(bookmarkNode);
}

function normalizeTableBookmarksInTable(tableNode, editor) {
if (!tableNode || tableNode.type !== 'table' || !Array.isArray(tableNode.content)) return tableNode;

const rows = tableNode.content.filter((child) => child?.type === 'tableRow');
if (!rows.length) return tableNode;

/** @type {{ start: Record<number, unknown[]>, end: Record<number, unknown[]> }[]} */
const rowCellInlines = rows.map(() => ({
start: /** @type {Record<number, unknown[]>} */ ({}),
end: /** @type {Record<number, unknown[]>} */ ({}),
}));
const updatedRows = rows.slice();
let rowCursor = 0;

const normalizedContent = tableNode.content.reduce((acc, child) => {
if (child?.type === 'tableRow') {
acc.push(updatedRows[rowCursor] ?? child);
rowCursor += 1;
return acc;
}

if (isBookmarkNode(child)) {
const prevRowIndex = rowCursor > 0 ? rowCursor - 1 : null;
const nextRowIndex = rowCursor < rows.length ? rowCursor : null;
const row = (nextRowIndex ?? prevRowIndex) != null ? rows[nextRowIndex ?? prevRowIndex] : null;
const rowCellCount = row?.content?.length ?? 0;

if (child.type === 'bookmarkStart') {
if (nextRowIndex != null)
addBookmarkToRowCellInlines(rowCellInlines, nextRowIndex, 'start', child, rowCellCount);
else if (prevRowIndex != null)
addBookmarkToRowCellInlines(rowCellInlines, prevRowIndex, 'end', child, rowCellCount);
} else {
if (prevRowIndex != null) addBookmarkToRowCellInlines(rowCellInlines, prevRowIndex, 'end', child, rowCellCount);
else if (nextRowIndex != null)
addBookmarkToRowCellInlines(rowCellInlines, nextRowIndex, 'start', child, rowCellCount);
}
return acc;
}

acc.push(child);
return acc;
}, []);

updatedRows.forEach((row, index) => {
const { start: startByCell, end: endByCell } = rowCellInlines[index];
const cellIndices = [
...new Set([...Object.keys(startByCell).map(Number), ...Object.keys(endByCell).map(Number)]),
].sort((a, b) => a - b);
for (const cellIndex of cellIndices) {
const startNodes = startByCell[cellIndex];
const endNodes = endByCell[cellIndex];
if (startNodes?.length)
updatedRows[index] = insertInlineIntoRow(updatedRows[index], startNodes, editor, 'start', cellIndex);
if (endNodes?.length)
updatedRows[index] = insertInlineIntoRow(updatedRows[index], endNodes, editor, 'end', cellIndex);
}
});

// Rebuild content with updated rows (preserve any non-row nodes in place).
let updatedRowIndex = 0;
const rebuiltContent = normalizedContent.map((child) => {
if (child?.type === 'tableRow') {
const replacement = updatedRows[updatedRowIndex] ?? child;
updatedRowIndex += 1;
return replacement;
}
return child;
});

return {
...tableNode,
content: rebuiltContent,
};
}

/**
* @param {number} [cellIndex] - If set, insert into this cell; otherwise first (start) or last (end) cell.
*/
function insertInlineIntoRow(rowNode, inlineNodes, editor, position, cellIndex) {
if (!rowNode || !inlineNodes?.length) return rowNode;

if (!Array.isArray(rowNode.content) || rowNode.content.length === 0) {
const paragraph = { type: 'paragraph', content: inlineNodes };
const newCell = { type: 'tableCell', content: [paragraph], attrs: {}, marks: [] };
return { ...rowNode, content: [newCell] };
}

const lastCellIndex = rowNode.content.length - 1;
const targetIndex =
cellIndex != null ? Math.min(Math.max(0, cellIndex), lastCellIndex) : position === 'end' ? lastCellIndex : 0;
const targetCell = rowNode.content[targetIndex];
const updatedCell = insertInlineIntoCell(targetCell, inlineNodes, editor, position);

if (updatedCell === targetCell) return rowNode;

const nextContent = rowNode.content.slice();
nextContent[targetIndex] = updatedCell;
return { ...rowNode, content: nextContent };
}

function findTextblockIndex(content, editor, fromEnd) {
const start = fromEnd ? content.length - 1 : 0;
const end = fromEnd ? -1 : content.length;
const step = fromEnd ? -1 : 1;
for (let i = start; fromEnd ? i > end : i < end; i += step) {
if (isTextblockNode(content[i], editor)) return i;
}
return -1;
}

function insertInlineIntoCell(cellNode, inlineNodes, editor, position) {
if (!cellNode || !inlineNodes?.length) return cellNode;

const content = Array.isArray(cellNode.content) ? cellNode.content.slice() : [];
const targetIndex = findTextblockIndex(content, editor, position === 'end');

if (targetIndex === -1) {
const paragraph = { type: 'paragraph', content: inlineNodes };
if (position === 'end') content.push(paragraph);
else content.unshift(paragraph);
return { ...cellNode, content };
}

const targetBlock = content[targetIndex] || { type: 'paragraph', content: [] };
const blockContent = Array.isArray(targetBlock.content) ? targetBlock.content.slice() : [];
const nextBlockContent = position === 'end' ? blockContent.concat(inlineNodes) : inlineNodes.concat(blockContent);

content[targetIndex] = { ...targetBlock, content: nextBlockContent };
return { ...cellNode, content };
}

function isBookmarkNode(node) {
const typeName = node?.type;
return typeName === 'bookmarkStart' || typeName === 'bookmarkEnd';
}

function isTextblockNode(node, editor) {
const typeName = node?.type;
if (!typeName) return false;
const nodeType = editor?.schema?.nodes?.[typeName];
if (nodeType && typeof nodeType.isTextblock === 'boolean') return nodeType.isTextblock;
return typeName === 'paragraph';
}

/**
* Reconstruct original OOXML for preservable inline nodes using their attribute decoders.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { describe, it, expect } from 'vitest';
import { collapseWhitespaceNextToInlinePassthrough, filterOutRootInlineNodes } from './docxImporter.js';
import {
collapseWhitespaceNextToInlinePassthrough,
filterOutRootInlineNodes,
normalizeTableBookmarksInContent,
} from './docxImporter.js';

const n = (type, attrs = {}) => ({ type, attrs, marks: [] });

Expand Down Expand Up @@ -178,3 +182,114 @@ describe('collapseWhitespaceNextToInlinePassthrough', () => {
expect(tree[0].content[2].content[0].text).toBe('bar');
});
});

describe('normalizeTableBookmarksInContent', () => {
const table = (content) => ({ type: 'table', content, attrs: {}, marks: [] });
const row = (cells) => ({ type: 'tableRow', content: cells, attrs: {}, marks: [] });
const cell = (content) => ({ type: 'tableCell', content, attrs: {}, marks: [] });
const paragraph = (content) => ({ type: 'paragraph', content, attrs: {}, marks: [] });
const text = (value) => ({ type: 'text', text: value, marks: [] });
const bookmarkStart = (id, attrs = {}) => ({ type: 'bookmarkStart', attrs: { id, ...attrs } });
const bookmarkEnd = (id, attrs = {}) => ({ type: 'bookmarkEnd', attrs: { id, ...attrs } });

it('moves leading bookmarkStart into the first cell paragraph', () => {
const input = [table([bookmarkStart('b1'), row([cell([paragraph([text('Cell')])])])])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

expect(normalizedTable.content.some((node) => node.type === 'bookmarkStart')).toBe(false);
const paraContent = normalizedTable.content[0].content[0].content[0].content;
expect(paraContent[0]).toMatchObject({ type: 'bookmarkStart', attrs: { id: 'b1' } });
expect(paraContent[1]).toMatchObject({ type: 'text', text: 'Cell' });
});

it('moves trailing bookmarkEnd into the last cell paragraph', () => {
const input = [table([row([cell([paragraph([text('Cell')])])]), bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

expect(normalizedTable.content.some((node) => node.type === 'bookmarkEnd')).toBe(false);
const paraContent = normalizedTable.content[0].content[0].content[0].content;
expect(paraContent[0]).toMatchObject({ type: 'text', text: 'Cell' });
expect(paraContent[1]).toMatchObject({ type: 'bookmarkEnd', attrs: { id: 'b1' } });
});

it('moves bookmarkStart and bookmarkEnd into the same cell when no textblocks exist', () => {
const input = [table([bookmarkStart('b1'), row([cell([])]), bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

expect(normalizedTable.content.some((node) => node.type === 'bookmarkStart')).toBe(false);
expect(normalizedTable.content.some((node) => node.type === 'bookmarkEnd')).toBe(false);

const paraContent = normalizedTable.content[0].content[0].content[0].content;
expect(paraContent).toEqual([
{ type: 'bookmarkStart', attrs: { id: 'b1' } },
{ type: 'bookmarkEnd', attrs: { id: 'b1' } },
]);
});

it('anchors bookmark boundaries to adjacent rows when markers appear between rows', () => {
const input = [
table([
bookmarkStart('b1'),
row([cell([paragraph([text('R1')])])]),
bookmarkEnd('b1'),
row([cell([paragraph([text('R2')])])]),
]),
];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

const row1Content = normalizedTable.content[0].content[0].content[0].content;
expect(row1Content).toEqual([
{ type: 'bookmarkStart', attrs: { id: 'b1' } },
{ type: 'text', text: 'R1', marks: [] },
{ type: 'bookmarkEnd', attrs: { id: 'b1' } },
]);

const row2Content = normalizedTable.content[1].content[0].content[0].content;
expect(row2Content).toEqual([{ type: 'text', text: 'R2', marks: [] }]);
});

it('creates a cell when a row is empty', () => {
const input = [table([bookmarkStart('b1'), row([]), bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];

const rowContent = normalizedTable.content[0].content;
expect(rowContent).toHaveLength(1);
expect(rowContent[0].type).toBe('tableCell');

const paraContent = rowContent[0].content[0].content;
expect(paraContent).toEqual([
{ type: 'bookmarkStart', attrs: { id: 'b1' } },
{ type: 'bookmarkEnd', attrs: { id: 'b1' } },
]);
});

it('places bookmarkStart in the cell indicated by colFirst when present; bookmarkEnd uses first/last cell only', () => {
const twoCells = row([cell([paragraph([text('A')])]), cell([paragraph([text('B')])])]);
const input = [table([bookmarkStart('b1', { colFirst: '1' }), twoCells, bookmarkEnd('b1')])];

const result = normalizeTableBookmarksInContent(input);
const normalizedTable = result[0];
const rowContent = normalizedTable.content[0].content;

expect(normalizedTable.content.some((node) => node.type === 'bookmarkStart')).toBe(false);
expect(normalizedTable.content.some((node) => node.type === 'bookmarkEnd')).toBe(false);

const firstCellContent = rowContent[0].content[0].content;
expect(firstCellContent).toEqual([{ type: 'text', text: 'A', marks: [] }]);

const secondCellContent = rowContent[1].content[0].content;
expect(secondCellContent[0]).toMatchObject({ type: 'bookmarkStart', attrs: { id: 'b1', colFirst: '1' } });
expect(secondCellContent[1]).toMatchObject({ type: 'text', text: 'B', marks: [] });
expect(secondCellContent[2]).toMatchObject({ type: 'bookmarkEnd', attrs: { id: 'b1' } });
});
});
Loading