From 6ad6ee380ee4d6b151532a49b89aca5ace2b5707 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 14 Jul 2025 17:57:53 +0800 Subject: [PATCH 01/76] init segment --- packages/poml/reader/segment.ts | 209 +++++++++++++++- packages/poml/tests/segment.test.ts | 353 ++++++++++++++++++++++++++++ 2 files changed, 560 insertions(+), 2 deletions(-) create mode 100644 packages/poml/tests/segment.test.ts diff --git a/packages/poml/reader/segment.ts b/packages/poml/reader/segment.ts index cf4e4425..2218c086 100644 --- a/packages/poml/reader/segment.ts +++ b/packages/poml/reader/segment.ts @@ -1,3 +1,5 @@ +import componentDocs from '../assets/componentDocs.json'; + export interface Segment { // Unique ID for caching and React keys id: string; @@ -16,6 +18,209 @@ export interface Segment { tagName?: string; } -export function createSegments(content: string, path?: string): Segment[] { - throw new Error('createSegments is not implemented yet'); +class Segmenter { + private nextId: number; + private sourcePath: string | undefined; + + constructor(sourcePath: string | undefined) { + this.nextId = 0; + this.sourcePath = sourcePath; + } + + private generateId(): string { + return `segment_${this.nextId++}`; + } + + private isValidPomlTag(tagName: string): boolean { + const validTags = new Set(); + + for (const doc of componentDocs) { + if (doc.name) { + validTags.add(doc.name.toLowerCase()); + validTags.add(doc.name.toLowerCase().replace(/([A-Z])/g, '-$1').toLowerCase()); + } + } + + validTags.add('poml'); + validTags.add('text'); + validTags.add('meta'); + + return validTags.has(tagName.toLowerCase()); + } + + private parseSegments(text: string, start: number = 0, parent?: Segment): Segment[] { + const segments: Segment[] = []; + let currentPos = start; + + while (currentPos < text.length) { + const nextOpenTag = text.indexOf('<', currentPos); + + if (nextOpenTag === -1) { + if (currentPos < text.length) { + const textContent = text.substring(currentPos); + if (textContent.trim()) { + segments.push({ + id: this.generateId(), + kind: 'TEXT', + start: currentPos, + end: text.length, + content: textContent, + path: this.sourcePath, + parent, + children: [] + }); + } + } + break; + } + + if (nextOpenTag > currentPos) { + const textContent = text.substring(currentPos, nextOpenTag); + if (textContent.trim()) { + segments.push({ + id: this.generateId(), + kind: 'TEXT', + start: currentPos, + end: nextOpenTag, + content: textContent, + path: this.sourcePath, + parent, + children: [] + }); + } + } + + const tagEndPos = text.indexOf('>', nextOpenTag); + if (tagEndPos === -1) { + currentPos = nextOpenTag + 1; + continue; + } + + const tagContent = text.substring(nextOpenTag + 1, tagEndPos); + const tagName = tagContent.trim().split(/\s+/)[0]; + + if (tagName.startsWith('/')) { + currentPos = tagEndPos + 1; + continue; + } + + if (tagContent.endsWith('/')) { + currentPos = tagEndPos + 1; + continue; + } + + if (!this.isValidPomlTag(tagName)) { + currentPos = tagEndPos + 1; + continue; + } + + const closingTag = ``; + const closingTagPos = this.findClosingTag(text, tagName, tagEndPos + 1); + + if (closingTagPos === -1) { + currentPos = tagEndPos + 1; + continue; + } + + const segmentContent = text.substring(nextOpenTag, closingTagPos + closingTag.length); + const innerContent = text.substring(tagEndPos + 1, closingTagPos); + + const segment: Segment = { + id: this.generateId(), + kind: tagName.toLowerCase() === 'meta' ? 'META' : 'POML', + start: nextOpenTag, + end: closingTagPos + closingTag.length, + content: segmentContent, + path: this.sourcePath, + parent, + children: [], + tagName: tagName.toLowerCase() + }; + + if (tagName.toLowerCase() === 'text') { + segment.children = this.parseSegments(innerContent, tagEndPos + 1, segment); + } else if (tagName.toLowerCase() !== 'meta') { + const childSegments = this.parseSegments(innerContent, tagEndPos + 1, segment); + segment.children = childSegments; + } + + segments.push(segment); + currentPos = closingTagPos + closingTag.length; + } + + return segments; + } + + private findClosingTag(text: string, tagName: string, startPos: number): number { + let depth = 1; + let pos = startPos; + + while (pos < text.length && depth > 0) { + const nextTag = text.indexOf('<', pos); + if (nextTag === -1) { + break; + } + + const tagEndPos = text.indexOf('>', nextTag); + if (tagEndPos === -1) { + break; + } + + const tagContent = text.substring(nextTag + 1, tagEndPos); + const currentTagName = tagContent.trim().split(/\s+/)[0]; + + if (currentTagName === tagName) { + depth++; + } else if (currentTagName === `/${tagName}`) { + depth--; + } + + pos = tagEndPos + 1; + } + + return depth === 0 ? pos - (``.length) : -1; + } + + public createSegments(content: string): Segment { + const rootSegments = this.parseSegments(content); + + if (rootSegments.length === 1 && rootSegments[0].kind === 'POML') { + return rootSegments[0]; + } + + if (rootSegments.length === 0) { + return { + id: this.generateId(), + kind: 'TEXT', + start: 0, + end: content.length, + content: content, + path: this.sourcePath, + children: [], + parent: undefined + }; + } + + const rootSegment: Segment = { + id: this.generateId(), + kind: 'TEXT', + start: 0, + end: content.length, + content: content, + path: this.sourcePath, + children: rootSegments, + parent: undefined + }; + + rootSegments.forEach(segment => { + segment.parent = rootSegment; + }); + + return rootSegment; + } +} + +export function createSegments(content: string, sourcePath?: string): Segment { + const segmenter = new Segmenter(sourcePath); + return segmenter.createSegments(content); } diff --git a/packages/poml/tests/segment.test.ts b/packages/poml/tests/segment.test.ts new file mode 100644 index 00000000..9c5d850e --- /dev/null +++ b/packages/poml/tests/segment.test.ts @@ -0,0 +1,353 @@ +import { describe, expect, test } from '@jest/globals'; +import { createSegments, Segment } from '../reader/segment'; + +describe('createSegments', () => { + test('pure text content', () => { + const content = 'This is pure text content with no POML tags.'; + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.content).toBe(content); + expect(segment.start).toBe(0); + expect(segment.end).toBe(content.length); + expect(segment.children).toHaveLength(0); + }); + + test('single POML tag', () => { + const content = 'Analyze the data'; + const segment = createSegments(content); + + expect(segment.kind).toBe('POML'); + expect(segment.tagName).toBe('task'); + expect(segment.content).toBe(content); + expect(segment.start).toBe(0); + expect(segment.end).toBe(content.length); + }); + + test('mixed content with text and POML', () => { + const content = `# My Analysis Document + +This is a regular markdown document that explains the task. + + + Analyze the following data and provide insights. + + +Here are some key points to consider: + +- Data quality +- Statistical significance +- Business impact`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(4); + + const children = segment.children; + expect(children[0].kind).toBe('TEXT'); + expect(children[0].content).toContain('# My Analysis Document'); + + expect(children[1].kind).toBe('POML'); + expect(children[1].tagName).toBe('task'); + expect(children[1].content).toBe(` + Analyze the following data and provide insights. +`); + + expect(children[2].kind).toBe('TEXT'); + expect(children[2].content).toContain('Here are some key points'); + + expect(children[3].kind).toBe('TEXT'); + expect(children[3].content).toContain('- Data quality'); + }); + + test('nested POML segments', () => { + const content = ` + + Sample data point 1 + Analysis result 1 + +`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('POML'); + expect(segment.tagName).toBe('examples'); + expect(segment.children).toHaveLength(2); + + const exampleSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'example'); + expect(exampleSegment).toBeDefined(); + expect(exampleSegment!.children).toHaveLength(3); + + const inputSegment = exampleSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'input'); + const outputSegment = exampleSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'output'); + + expect(inputSegment).toBeDefined(); + expect(outputSegment).toBeDefined(); + }); + + test('text tag with nested content', () => { + const content = ` + Process the following data + + This is **markdown** content that will be processed as pure text. + + - Item 1 + - Item 2 + + This is a nested POML component that will be processed as POML. + + No POML processing happens here. + + Remember to check the format +`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('POML'); + expect(segment.tagName).toBe('poml'); + expect(segment.children).toHaveLength(4); + + const textSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'text'); + expect(textSegment).toBeDefined(); + expect(textSegment!.children).toHaveLength(3); + + const nestedCpSegment = textSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); + expect(nestedCpSegment).toBeDefined(); + expect(nestedCpSegment!.content).toBe('This is a nested POML component that will be processed as POML.'); + }); + + test('meta tags', () => { + const content = `John Doe + + { "task": { "captionStyle": "bold" } } + + + +Complete the analysis`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(3); + + const metaSegment = segment.children.find(c => c.kind === 'META'); + expect(metaSegment).toBeDefined(); + expect(metaSegment!.tagName).toBe('meta'); + expect(metaSegment!.children).toHaveLength(0); + + const taskSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'task'); + expect(taskSegment).toBeDefined(); + }); + + test('invalid tags are ignored', () => { + const content = `This should be ignored +This should be processed +This should also be ignored`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(3); + + const taskSegment = segment.children.find(c => c.kind === 'POML'); + expect(taskSegment).toBeDefined(); + expect(taskSegment!.tagName).toBe('task'); + + const textSegments = segment.children.filter(c => c.kind === 'TEXT'); + expect(textSegments).toHaveLength(2); + expect(textSegments[0].content).toContain('This should be ignored'); + expect(textSegments[1].content).toContain('This should also be ignored'); + }); + + test('self-closing tags are ignored', () => { + const content = `Valid task +
+ +Valid hint`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(4); + + const pomlSegments = segment.children.filter(c => c.kind === 'POML'); + expect(pomlSegments).toHaveLength(3); + expect(pomlSegments[0].tagName).toBe('task'); + expect(pomlSegments[2].tagName).toBe('hint'); + }); + + test('malformed tags are handled gracefully', () => { + const content = `Incomplete tag +Complete hint +This has no closing tag`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(3); + + const hintSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); + expect(hintSegment).toBeDefined(); + expect(hintSegment!.content).toBe('Complete hint'); + + const textSegments = segment.children.filter(c => c.kind === 'TEXT'); + expect(textSegments).toHaveLength(2); + expect(textSegments[0].content).toBe('Incomplete tag\n'); + expect(textSegments[1].content).toBe('\nThis has no closing tag'); + }); + + test('malformed POML tags are ignored', () => { + const content = `Valid task`; + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(0); + }); + + test('empty content', () => { + const content = ''; + const segment = createSegments(content); + + }); + + test('whitespace-only content', () => { + const content = ' \n\n\t \n '; + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.content).toBe(content); + expect(segment.children).toHaveLength(0); + }); + + test('hyphenated tag names', () => { + const content = `JSON format +System message +User message`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(4); + + const pomlSegments = segment.children.filter(c => c.kind === 'POML'); + expect(pomlSegments).toHaveLength(3); + expect(pomlSegments[0].tagName).toBe('output-format'); + expect(pomlSegments[1].tagName).toBe('system-msg'); + expect(pomlSegments[2].tagName).toBe('user-msg'); + }); + + test('parent-child relationships', () => { + const content = ` + This is a hint + Some text + + Example 1 + +`; + + const segment = createSegments(content); + + const taskSegment = segment; + expect(taskSegment.kind).toBe('POML'); + expect(taskSegment.tagName).toBe('task'); + expect(taskSegment.parent).toBeUndefined(); + + const hintSegment = taskSegment.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); + expect(hintSegment).toBeDefined(); + expect(hintSegment!.parent).toBe(taskSegment); + + const examplesSegment = taskSegment.children.find(c => c.kind === 'POML' && c.tagName === 'examples'); + expect(examplesSegment).toBeDefined(); + expect(examplesSegment!.parent).toBe(taskSegment); + + const exampleSegment = examplesSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'example'); + expect(exampleSegment).toBeDefined(); + expect(exampleSegment!.parent).toBe(examplesSegment); + }); + + test('segment IDs are unique', () => { + const content = `First task +Second task +A hint`; + + const segment = createSegments(content); + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(5); + + function collectAllSegments(segment: Segment): Segment[] { + const all = [segment]; + segment.children.forEach(child => { + all.push(...collectAllSegments(child)); + }); + return all; + } + + const allSegments = collectAllSegments(segment); + const ids = allSegments.map(s => s.id); + const uniqueIds = new Set(ids); + + expect(uniqueIds.size).toBe(ids.length); + }); + + test('path parameter is preserved', () => { + const content = 'Test task'; + const path = '/test/path/file.poml'; + const segment = createSegments(content, path); + + expect(segment.path).toBe(path); + expect(segment.children[0].path).toBe(path); + }); + + test('complex example from specification', () => { + const content = ` + Process the following data + + This is **markdown** content that will be processed as pure text. + + - Item 1 + - Item 2 + + {{ VARIABLES_WILL_ALSO_SHOWN_AS_IS }} + This is a nested POML component that will be processed as POML. + + No POML processing happens here. + + Remember to check the format + + +There can be some intervening text here as well. + + +

You can add another POML segment here: {{variable_will_be_substituted}}

+
+ +

POML elements do not necessarily reside in a poml element.

`; + + const segment = createSegments(content); + + expect(segment.kind).toBe('TEXT'); + expect(segment.children).toHaveLength(5); + + const firstPomlSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'poml'); + expect(firstPomlSegment).toBeDefined(); + expect(firstPomlSegment!.children).toHaveLength(4); + + const textSegment = firstPomlSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'text'); + expect(textSegment).toBeDefined(); + expect(textSegment!.children).toHaveLength(3); + + const cpSegment = textSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); + expect(cpSegment).toBeDefined(); + + const secondPomlSegment = segment.children.filter(c => c.kind === 'POML' && c.tagName === 'poml')[1]; + expect(secondPomlSegment).toBeDefined(); + + const lineBreakSegment = segment.children[3]; + expect(lineBreakSegment.kind).toBe('TEXT'); + expect(lineBreakSegment.content).toBe('\n\n'); + + const pSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'p'); + expect(pSegment).toBeDefined(); + }); +}); \ No newline at end of file From c602a27bcb537578947995e34af3e0ff24ee9d28 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 14 Jul 2025 23:10:38 +0800 Subject: [PATCH 02/76] add tests --- packages/poml/tests/segment.test.ts | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/packages/poml/tests/segment.test.ts b/packages/poml/tests/segment.test.ts index 9c5d850e..dad79c64 100644 --- a/packages/poml/tests/segment.test.ts +++ b/packages/poml/tests/segment.test.ts @@ -86,6 +86,34 @@ Here are some key points to consider: expect(outputSegment).toBeDefined(); }); + test('text in text', () => { + const content = `This is a text with nested text content.`; + const segment = createSegments(content); + expect(segment.kind).toBe('TEXT'); + expect(segment.content).toBe(content); + expect(segment.children).toHaveLength(0); + }); + + test('text in text in POML', () => { + const content = `This is a text with nested text content.`; + const segment = createSegments(content); + expect(segment.kind).toBe('POML'); + expect(segment.tagName).toBe('poml'); + expect(segment.children).toHaveLength(1); + const textSegment = segment.children[0]; + expect(textSegment.kind).toBe('TEXT'); + expect(textSegment.content).toBe('This is a text with nested text content.'); + }); + + test('nested tag in POML', () => { + const content = `Process data with nested task content.`; + const segment = createSegments(content); + expect(segment.kind).toBe('POML'); + expect(segment.tagName).toBe('poml'); + expect(segment.children).toHaveLength(0); + expect(segment.content).toBe('Process data with nested task content.'); + }); + test('text tag with nested content', () => { const content = ` Process the following data From 6e855ba98559916f720bb4c4f066e24e1de0e444 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 14 Jul 2025 23:15:03 +0800 Subject: [PATCH 03/76] . --- packages/poml/tests/segment.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/poml/tests/segment.test.ts b/packages/poml/tests/segment.test.ts index dad79c64..45b9e082 100644 --- a/packages/poml/tests/segment.test.ts +++ b/packages/poml/tests/segment.test.ts @@ -62,7 +62,7 @@ Here are some key points to consider: }); test('nested POML segments', () => { - const content = ` + const content = ` Sample data point 1 Analysis result 1 From 9d3484b1e067aa7fb914ee5e77c2961463b714d3 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 14 Jul 2025 23:15:47 +0800 Subject: [PATCH 04/76] . --- packages/poml/tests/segment.test.ts | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/packages/poml/tests/segment.test.ts b/packages/poml/tests/segment.test.ts index 45b9e082..72a6f867 100644 --- a/packages/poml/tests/segment.test.ts +++ b/packages/poml/tests/segment.test.ts @@ -73,17 +73,8 @@ Here are some key points to consider: expect(segment.kind).toBe('POML'); expect(segment.tagName).toBe('examples'); - expect(segment.children).toHaveLength(2); - - const exampleSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'example'); - expect(exampleSegment).toBeDefined(); - expect(exampleSegment!.children).toHaveLength(3); - - const inputSegment = exampleSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'input'); - const outputSegment = exampleSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'output'); - - expect(inputSegment).toBeDefined(); - expect(outputSegment).toBeDefined(); + expect(segment.children).toHaveLength(0); + expect(segment.content).toBe(content); }); test('text in text', () => { @@ -105,7 +96,7 @@ Here are some key points to consider: expect(textSegment.content).toBe('This is a text with nested text content.'); }); - test('nested tag in POML', () => { + test('nested same tag in POML', () => { const content = `Process data with nested task content.`; const segment = createSegments(content); expect(segment.kind).toBe('POML'); From 4575d3ad215558e88190eb0c45e28deb80c41820 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 15 Jul 2025 10:14:39 +0800 Subject: [PATCH 05/76] . --- docs/proposals/poml_extended.md | 103 +++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/docs/proposals/poml_extended.md b/docs/proposals/poml_extended.md index b2a2f000..491a8949 100644 --- a/docs/proposals/poml_extended.md +++ b/docs/proposals/poml_extended.md @@ -107,53 +107,89 @@ File-level metadata can be included at any place of the file in a special `...` block spanning the whole file (in which case it will be treated as a `POML` segment). - 3. Use a stack-based algorithm to scan the text. - * When an opening tag (e.g., ``) that matches a known POML component is found, push its name and start position onto the stack. This marks the beginning of a potential `POML` segment. - * When a closing tag (e.g., ``) is found that matches the tag at the top of the stack, pop the stack. This marks a complete `POML` segment. This new segment is added as a child to the current parent segment in the tree. - * The special `` tag is handled recursively. If a `` tag is found *inside* a `POML` segment, the scanner will treat its content as a nested `TEXT` segment. This `TEXT` segment can, in turn, contain more `POML` children. - * Any content not enclosed within identified `POML` tags remains part of its parent `TEXT` segment. - 4. `` tags are treated specially. They are identified and parsed into `META` segments at any level but are logically hoisted and processed first. They should not have children. -* **Output**: A `Segment` tree. For backward compatibility, if the root segment is a single `...` block spanning the whole file, the system can revert to the original, simpler parsing model. +* **Tokenization**: Standard XML tokenization logic is used to break the input into tokens (tags, text content, attributes, etc.). -**`Segment` Interface**: The `children` property is key to representing the nested structure of mixed-content files. +* **AST Parsing Algorithm**: + 1. Scan until `<` and tag name is found. + 2. If the tag name is `text`, create a text node and scan until the corresponding `` is found (handling nested POML if present). + 3. If the tag name matches any POML tag from `componentDocs.json`, create a node with the tag name and attributes. + 4. Within POML tags, if another `text` tag is found, follow the same logic as step 2. + 5. Close the node when the corresponding closing tag `` is found. + +* **Error Tolerance**: The parser is designed to be error-tolerant, gracefully handling malformed markup while preserving as much structure as possible. + +* **Source Mapping**: The parser retains source mapping information for each AST node, enabling code intelligence features like hover, go to definition, find references, and auto completion. + +* **Output**: An AST representing the hierarchical structure of the document, where each node contains source position information and type metadata. + +**`ASTNode` Interface**: The AST nodes represent the parsed structure with source mapping. ```typescript -interface Segment { - id: string; // Unique ID for caching and React keys - kind: 'META' | 'TEXT' | 'POML'; +interface SourceRange { start: number; end: number; - content: string; // The raw string content of the segment - parent?: Segment; // Reference to the parent segment - children: Segment[]; // Nested segments (e.g., a POML block within text) - tagName?: string; // For POML segments, the name of the root tag (e.g., 'task') +} + +interface AttributeInfo { + key: string; + value: string; + keyRange: SourceRange; // Position of attribute name + valueRange: SourceRange; // Position of attribute value (excluding quotes) + fullRange: SourceRange; // Full attribute including key="value" +} + +interface ASTNode { + id: string; // Unique ID for caching and React keys + kind: 'META' | 'TEXT' | 'POML'; + start: number; // Source position start of entire node + end: number; // Source position end of entire node + content: string; // The raw string content + parent?: ASTNode; // Reference to the parent node + children: ASTNode[]; // Child nodes + + // For POML and META nodes + tagName?: string; // Tag name (e.g., 'task', 'meta') + attributes?: AttributeInfo[]; // Detailed attribute information + + // Detailed source positions + openingTag?: { + start: number; // Position of '<' + end: number; // Position after '>' + nameRange: SourceRange; // Position of tag name + }; + + closingTag?: { + start: number; // Position of '' + nameRange: SourceRange; // Position of tag name in closing tag + }; + + contentRange?: SourceRange; // Position of content between tags (excluding nested tags) + + // For TEXT nodes + textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML) } ``` #### II. Metadata Processing -Once the segment tree is built, all `META` segments are processed. +Once the AST is built, all `META` nodes are processed. - * **Extraction**: Traverse the tree to find all `META` segments. + * **Extraction**: Traverse the AST to find all `META` nodes. * **Population**: Parse the content of each `` tag and populate the global `PomlContext` object. - * **Removal**: After processing, `META` segments are removed from the tree to prevent them from being rendered. + * **Removal**: After processing, `META` nodes are removed from the AST to prevent them from being rendered. **`PomlContext` Interface**: This context object is the single source of truth for the entire file, passed through all readers. It's mutable, allowing stateful operations like `` to have a file-wide effect. ```typescript interface PomlContext { variables: { [key: string]: any }; // For {{ substitutions }} and (Read/Write) - texts: { [key: string]: React.ReactElement }; // Maps TEXT_ID to content for replacement (Read/Write) stylesheet: { [key: string]: string }; // Merged styles from all tags (Read-Only during render) minimalPomlVersion?: string; // From (Read-Only) sourcePath: string; // File path for resolving includes (Read-Only) @@ -162,22 +198,21 @@ interface PomlContext { #### III. Text/POML Dispatching (Recursive Rendering) -Rendering starts at the root of the segment tree and proceeds recursively. A controller dispatches segments to the appropriate reader. +Rendering starts at the root of the AST and proceeds recursively. A controller dispatches AST nodes to the appropriate reader. -* **`PureTextReader`**: Handles `TEXT` segments. +* **`PureTextReader`**: Handles `TEXT` nodes. * Currently we directly render the pure-text contents as a single React element. In future, we can: * Renders the text content, potentially using a Markdown processor. * Performs variable substitutions (`{{...}}`) using the `variables` from `PomlContext`. The logic from `handleText` in the original `PomlFile` should be extracted into a shared utility for this. - * Iterates through its `children` segments. For each child `POML` segment, it calls the `PomlReader`. + * Iterates through its `children` nodes. For each child `POML` node, it calls the `PomlReader`. -* **`PomlReader`**: Handles `POML` segments. +* **`PomlReader`**: Handles `POML` nodes. - * **Pre-processing**: Before parsing, it replaces any direct child `` regions with a self-closing placeholder tag containing a unique ID: ``. The original content of the `` segment is stored in `context.texts`. This ensures the XML parser inside `PomlFile` doesn't fail on non-XML content (like Markdown). - * **Delegation**: Instantiates a modified `PomlFile` class with the processed segment content and the shared `PomlContext`. - * **Rendering**: Calls the `pomlFile.react(context)` method to render the segment. + * **Delegation**: Instantiates a modified `PomlFile` class with the processed node content and the shared `PomlContext`. + * **Rendering**: Calls the `pomlFile.react(context)` method to render the node. -* **`IntelliSense Layer`**: The segment tree makes it easy to provide context-aware IntelliSense. By checking the `kind` of the segment at the cursor's offset, the request can be routed to the correct provider—either the `PomlReader`'s XML-aware completion logic or a simpler text/variable completion provider for `TEXT` segments. +* **`IntelliSense Layer`**: The AST makes it easy to provide context-aware IntelliSense. By checking the `kind` of the node at the cursor's offset, the request can be routed to the correct provider—either the `PomlReader`'s XML-aware completion logic or a simpler text/variable completion provider for `TEXT` nodes. **`Reader` Interface**: This interface defines the contract for both `PureTextReader` and `PomlReader`. From d201013c7e332e65e7814d44abdcf326e63ef8c4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 15 Jul 2025 11:07:03 +0800 Subject: [PATCH 06/76] . --- .claude/settings.json | 9 +++++++++ docs/proposals/poml_extended.md | 16 ++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) create mode 100644 .claude/settings.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..02e05248 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,9 @@ +{ + "permissions": { + "allow": [ + "Bash(npm run lint)", + "Bash(npm run test*)", + "Read(~/.zshrc)" + ] + } +} \ No newline at end of file diff --git a/docs/proposals/poml_extended.md b/docs/proposals/poml_extended.md index 491a8949..4f6d3e11 100644 --- a/docs/proposals/poml_extended.md +++ b/docs/proposals/poml_extended.md @@ -113,14 +113,15 @@ The core of the new architecture is a three-pass process: Tokenization and AST P This phase processes the raw file content through a standard compiling workflow: tokenization followed by parsing to an Abstract Syntax Tree (AST). -* **Tokenization**: Standard XML tokenization logic is used to break the input into tokens (tags, text content, attributes, etc.). +* **Tokenization**: Standard XML tokenization logic is used to break the input into tokens (tags, text content, attributes, etc.). Additionally, template variables in `{{}}` format are identified and tokenized as special tokens to enable proper parsing and variable substitution. * **AST Parsing Algorithm**: 1. Scan until `<` and tag name is found. - 2. If the tag name is `text`, create a text node and scan until the corresponding `` is found (handling nested POML if present). - 3. If the tag name matches any POML tag from `componentDocs.json`, create a node with the tag name and attributes. + 2. If the tag name is `text`, create a text node and scan until the corresponding `` is found (handling nested POML if present; template variables are not considered here). + 3. If the tag name matches any POML tag from `componentDocs.json`, create a node with the tag name and attributes (template variables `{{}}` in attribute values are parsed as child template nodes). 4. Within POML tags, if another `text` tag is found, follow the same logic as step 2. - 5. Close the node when the corresponding closing tag `` is found. + 5. Template variables `{{}}` found within text content or attribute values create TEMPLATE nodes as children. + 6. Close the node when the corresponding closing tag `` is found. * **Error Tolerance**: The parser is designed to be error-tolerant, gracefully handling malformed markup while preserving as much structure as possible. @@ -138,7 +139,7 @@ interface SourceRange { interface AttributeInfo { key: string; - value: string; + value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; // Mixed content: array of text/template nodes keyRange: SourceRange; // Position of attribute name valueRange: SourceRange; // Position of attribute value (excluding quotes) fullRange: SourceRange; // Full attribute including key="value" @@ -146,7 +147,7 @@ interface AttributeInfo { interface ASTNode { id: string; // Unique ID for caching and React keys - kind: 'META' | 'TEXT' | 'POML'; + kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE'; start: number; // Source position start of entire node end: number; // Source position end of entire node content: string; // The raw string content @@ -174,6 +175,9 @@ interface ASTNode { // For TEXT nodes textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML) + + // For TEMPLATE nodes + expression?: string; // The full expression content between {{}} } ``` From 66e7a2cc1cff8034c7d8b76ccc4767152bb69e67 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 15 Jul 2025 11:21:41 +0800 Subject: [PATCH 07/76] update to ast implementation --- packages/poml/reader/ast.ts | 543 ++++++++++++++++++++++++++++ packages/poml/tests/segment.test.ts | 353 ++++++++++-------- 2 files changed, 744 insertions(+), 152 deletions(-) create mode 100644 packages/poml/reader/ast.ts diff --git a/packages/poml/reader/ast.ts b/packages/poml/reader/ast.ts new file mode 100644 index 00000000..4ec9fb2e --- /dev/null +++ b/packages/poml/reader/ast.ts @@ -0,0 +1,543 @@ +import componentDocs from '../assets/componentDocs.json'; + +// Source position and attribute interfaces +export interface SourceRange { + start: number; + end: number; +} + +export interface AttributeInfo { + key: string; + value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; // Mixed content: array of text/template nodes + keyRange: SourceRange; // Position of attribute name + valueRange: SourceRange; // Position of attribute value (excluding quotes) + fullRange: SourceRange; // Full attribute including key="value" +} + +// Main AST node interface +export interface ASTNode { + id: string; // Unique ID for caching and React keys + kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE'; + start: number; // Source position start of entire node + end: number; // Source position end of entire node + content: string; // The raw string content + parent?: ASTNode; // Reference to the parent node + children: ASTNode[]; // Child nodes + + // For POML and META nodes + tagName?: string; // Tag name (e.g., 'task', 'meta') + attributes?: AttributeInfo[]; // Detailed attribute information + + // Detailed source positions + openingTag?: { + start: number; // Position of '<' + end: number; // Position after '>' + nameRange: SourceRange; // Position of tag name + }; + + closingTag?: { + start: number; // Position of '' + nameRange: SourceRange; // Position of tag name in closing tag + }; + + contentRange?: SourceRange; // Position of content between tags (excluding nested tags) + + // For TEXT nodes + textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML) + + // For TEMPLATE nodes + expression?: string; // The full expression content between {{}} +} + +// Token types for tokenization +interface Token { + type: 'TEXT' | 'TAG_OPEN' | 'TAG_CLOSE' | 'TAG_SELF_CLOSE' | 'TEMPLATE_VAR' | 'ATTRIBUTE'; + value: string; + start: number; + end: number; +} + +// Tokenizer class +class Tokenizer { + private input: string; + private position: number; + + constructor(input: string) { + this.input = input; + this.position = 0; + } + + tokenize(): Token[] { + const tokens: Token[] = []; + + while (this.position < this.input.length) { + // Check for template variables first + if (this.peek() === '{' && this.peek(1) === '{') { + tokens.push(this.readTemplateVariable()); + continue; + } + + // Check for XML tags + if (this.peek() === '<') { + const tagToken = this.readTag(); + if (tagToken) { + tokens.push(tagToken); + continue; + } + } + + // Read text content + const textToken = this.readText(); + if (textToken.value.length > 0) { + tokens.push(textToken); + } + } + + return tokens; + } + + private peek(offset: number = 0): string { + return this.input[this.position + offset] || ''; + } + + private advance(): string { + return this.input[this.position++] || ''; + } + + private readTemplateVariable(): Token { + const start = this.position; + this.advance(); // { + this.advance(); // { + + while (this.position < this.input.length && !(this.peek() === '}' && this.peek(1) === '}')) { + this.advance(); + } + + if (this.peek() === '}' && this.peek(1) === '}') { + this.advance(); // } + this.advance(); // } + } + + return { + type: 'TEMPLATE_VAR', + value: this.input.substring(start, this.position), + start, + end: this.position + }; + } + + private readTag(): Token | null { + const start = this.position; + this.advance(); // < + + // Skip whitespace + while (this.peek() === ' ' || this.peek() === '\t' || this.peek() === '\n') { + this.advance(); + } + + // Check for closing tag + const isClosing = this.peek() === '/'; + if (isClosing) { + this.advance(); + } + + // Read tag name + let tagName = ''; + while (this.position < this.input.length && + this.peek() !== '>' && + this.peek() !== ' ' && + this.peek() !== '\t' && + this.peek() !== '\n') { + tagName += this.advance(); + } + + // Skip attributes for now (will be parsed separately) + while (this.position < this.input.length && this.peek() !== '>') { + this.advance(); + } + + if (this.peek() === '>') { + this.advance(); // > + + // Check if self-closing + const content = this.input.substring(start, this.position); + const isSelfClosing = content.endsWith('/>'); + + return { + type: isSelfClosing ? 'TAG_SELF_CLOSE' : (isClosing ? 'TAG_CLOSE' : 'TAG_OPEN'), + value: content, + start, + end: this.position + }; + } + + // Invalid tag, backtrack + this.position = start + 1; + return null; + } + + private readText(): Token { + const start = this.position; + + while (this.position < this.input.length && + this.peek() !== '<' && + !(this.peek() === '{' && this.peek(1) === '{')) { + this.advance(); + } + + return { + type: 'TEXT', + value: this.input.substring(start, this.position), + start, + end: this.position + }; + } +} + +// AST Parser class +class ASTParser { + private tokens: Token[]; + private position: number; + private nextId: number; + private validPomlTags: Set; + + constructor(tokens: Token[]) { + this.tokens = tokens; + this.position = 0; + this.nextId = 0; + this.validPomlTags = this.buildValidTagsSet(); + } + + private buildValidTagsSet(): Set { + const validTags = new Set(); + + for (const doc of componentDocs) { + if (doc.name) { + validTags.add(doc.name.toLowerCase()); + // Convert camelCase to kebab-case + validTags.add(doc.name.toLowerCase().replace(/([A-Z])/g, '-$1').toLowerCase()); + } + } + + // Add special tags + validTags.add('poml'); + validTags.add('text'); + validTags.add('meta'); + + return validTags; + } + + private generateId(): string { + return `ast_${this.nextId++}`; + } + + private peek(): Token | undefined { + return this.tokens[this.position]; + } + + private advance(): Token | undefined { + return this.tokens[this.position++]; + } + + private extractTagName(tagContent: string): string { + // Remove < and > and any attributes + const content = tagContent.slice(1, -1); + const match = content.match(/^\/?\s*([a-zA-Z][\w-]*)/); + return match ? match[1] : ''; + } + + private parseAttributeValue(value: string): (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] { + // Parse attribute value for mixed text and template variables + const result: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] = []; + let currentPos = 0; + + while (currentPos < value.length) { + const templateStart = value.indexOf('{{', currentPos); + + if (templateStart === -1) { + // No more template variables, add remaining text + if (currentPos < value.length) { + result.push({ + id: this.generateId(), + kind: 'TEXT', + start: currentPos, + end: value.length, + content: value.substring(currentPos), + children: [] + }); + } + break; + } + + // Add text before template variable + if (templateStart > currentPos) { + result.push({ + id: this.generateId(), + kind: 'TEXT', + start: currentPos, + end: templateStart, + content: value.substring(currentPos, templateStart), + children: [] + }); + } + + // Find end of template variable + const templateEnd = value.indexOf('}}', templateStart + 2); + if (templateEnd === -1) { + // Malformed template, treat as text + result.push({ + id: this.generateId(), + kind: 'TEXT', + start: templateStart, + end: value.length, + content: value.substring(templateStart), + children: [] + }); + break; + } + + // Add template variable + const templateContent = value.substring(templateStart + 2, templateEnd); + result.push({ + id: this.generateId(), + kind: 'TEMPLATE', + start: templateStart, + end: templateEnd + 2, + content: value.substring(templateStart, templateEnd + 2), + expression: templateContent.trim(), + children: [] + }); + + currentPos = templateEnd + 2; + } + + return result; + } + + private parseAttributes(tagContent: string): AttributeInfo[] { + const attributes: AttributeInfo[] = []; + + // Simple attribute parsing - can be enhanced later + const attrRegex = /(\w+)=["']([^"']*?)["']/g; + let match; + + while ((match = attrRegex.exec(tagContent)) !== null) { + const key = match[1]; + const value = match[2]; + const fullMatch = match[0]; + const matchStart = match.index; + + attributes.push({ + key, + value: this.parseAttributeValue(value), + keyRange: { start: matchStart, end: matchStart + key.length }, + valueRange: { start: matchStart + key.length + 2, end: matchStart + key.length + 2 + value.length }, + fullRange: { start: matchStart, end: matchStart + fullMatch.length } + }); + } + + return attributes; + } + + parse(): ASTNode { + const children = this.parseNodes(); + + if (children.length === 1 && children[0].kind === 'POML') { + return children[0]; + } + + // Create root text node + const rootNode: ASTNode = { + id: this.generateId(), + kind: 'TEXT', + start: 0, + end: this.tokens.length > 0 ? this.tokens[this.tokens.length - 1].end : 0, + content: this.tokens.map(t => t.value).join(''), + children, + textSegments: [] + }; + + // Set parent references + children.forEach(child => { + child.parent = rootNode; + }); + + return rootNode; + } + + private parseNodes(): ASTNode[] { + const nodes: ASTNode[] = []; + + while (this.position < this.tokens.length) { + const token = this.peek(); + if (!token) break; + + if (token.type === 'TEMPLATE_VAR') { + nodes.push(this.parseTemplateVariable()); + } else if (token.type === 'TAG_OPEN') { + const tagName = this.extractTagName(token.value); + + if (this.validPomlTags.has(tagName.toLowerCase())) { + const node = this.parsePomlNode(); + if (node) { + nodes.push(node); + } + } else { + // Invalid tag, treat as text + nodes.push(this.parseTextFromToken()); + } + } else if (token.type === 'TEXT') { + nodes.push(this.parseTextFromToken()); + } else { + // Skip other token types for now + this.advance(); + } + } + + return nodes; + } + + private parseTemplateVariable(): ASTNode { + const token = this.advance()!; + const expression = token.value.slice(2, -2).trim(); // Remove {{ and }} + + return { + id: this.generateId(), + kind: 'TEMPLATE', + start: token.start, + end: token.end, + content: token.value, + expression, + children: [] + }; + } + + private parseTextFromToken(): ASTNode { + const token = this.advance()!; + + return { + id: this.generateId(), + kind: 'TEXT', + start: token.start, + end: token.end, + content: token.value, + children: [], + textSegments: [{ start: token.start, end: token.end }] + }; + } + + private parsePomlNode(): ASTNode | null { + const openToken = this.advance()!; + const tagName = this.extractTagName(openToken.value); + + // Parse attributes + const attributes = this.parseAttributes(openToken.value); + + // Determine node kind + const kind = tagName.toLowerCase() === 'meta' ? 'META' : 'POML'; + + const node: ASTNode = { + id: this.generateId(), + kind, + start: openToken.start, + end: openToken.end, // Will be updated when we find closing tag + content: openToken.value, // Will be updated + tagName: tagName.toLowerCase(), + attributes, + children: [], + openingTag: { + start: openToken.start, + end: openToken.end, + nameRange: { + start: openToken.start + 1, + end: openToken.start + 1 + tagName.length + } + } + }; + + // Parse children until we find the closing tag + const children: ASTNode[] = []; + let depth = 1; + + while (this.position < this.tokens.length && depth > 0) { + const token = this.peek(); + if (!token) break; + + if (token.type === 'TAG_OPEN') { + const childTagName = this.extractTagName(token.value); + if (childTagName.toLowerCase() === tagName.toLowerCase()) { + depth++; + } + + // Special handling for text tags - don't process template variables + if (tagName.toLowerCase() === 'text') { + children.push(this.parseTextFromToken()); + } else if (this.validPomlTags.has(childTagName.toLowerCase())) { + const childNode = this.parsePomlNode(); + if (childNode) { + childNode.parent = node; + children.push(childNode); + } + } else { + children.push(this.parseTextFromToken()); + } + } else if (token.type === 'TAG_CLOSE') { + const closeTagName = this.extractTagName(token.value); + if (closeTagName.toLowerCase() === tagName.toLowerCase()) { + depth--; + if (depth === 0) { + // Found our closing tag + const closeToken = this.advance()!; + node.end = closeToken.end; + node.closingTag = { + start: closeToken.start, + end: closeToken.end, + nameRange: { + start: closeToken.start + 2, + end: closeToken.start + 2 + tagName.length + } + }; + break; + } + } + this.advance(); + } else if (token.type === 'TEMPLATE_VAR' && tagName.toLowerCase() !== 'text') { + // Only parse template variables outside of text tags + const templateNode = this.parseTemplateVariable(); + templateNode.parent = node; + children.push(templateNode); + } else { + const textNode = this.parseTextFromToken(); + textNode.parent = node; + children.push(textNode); + } + } + + node.children = children; + + // Update content to include full tag + if (node.closingTag) { + node.content = this.tokens.slice( + this.tokens.findIndex(t => t.start === node.start), + this.tokens.findIndex(t => t.end === node.end) + 1 + ).map(t => t.value).join(''); + } + + return node; + } +} + +// Main parsing function +export function parseAST(content: string): ASTNode { + const tokenizer = new Tokenizer(content); + const tokens = tokenizer.tokenize(); + const parser = new ASTParser(tokens); + return parser.parse(); +} + +export class PomlAstParser { + static parse(content: string): ASTNode { + return parseAST(content); + } +} \ No newline at end of file diff --git a/packages/poml/tests/segment.test.ts b/packages/poml/tests/segment.test.ts index 72a6f867..b23e16ef 100644 --- a/packages/poml/tests/segment.test.ts +++ b/packages/poml/tests/segment.test.ts @@ -1,27 +1,27 @@ import { describe, expect, test } from '@jest/globals'; -import { createSegments, Segment } from '../reader/segment'; +import { parseAST, ASTNode } from '../reader/ast'; -describe('createSegments', () => { +describe('parseAST', () => { test('pure text content', () => { const content = 'This is pure text content with no POML tags.'; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.content).toBe(content); - expect(segment.start).toBe(0); - expect(segment.end).toBe(content.length); - expect(segment.children).toHaveLength(0); + expect(ast.kind).toBe('TEXT'); + expect(ast.content).toBe(content); + expect(ast.start).toBe(0); + expect(ast.end).toBe(content.length); + expect(ast.children).toHaveLength(0); }); test('single POML tag', () => { const content = 'Analyze the data'; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('POML'); - expect(segment.tagName).toBe('task'); - expect(segment.content).toBe(content); - expect(segment.start).toBe(0); - expect(segment.end).toBe(content.length); + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('task'); + expect(ast.content).toBe(content); + expect(ast.start).toBe(0); + expect(ast.end).toBe(content.length); }); test('mixed content with text and POML', () => { @@ -39,12 +39,12 @@ Here are some key points to consider: - Statistical significance - Business impact`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(4); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(4); - const children = segment.children; + const children = ast.children; expect(children[0].kind).toBe('TEXT'); expect(children[0].content).toContain('# My Analysis Document'); @@ -69,40 +69,40 @@ Here are some key points to consider: `; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('POML'); - expect(segment.tagName).toBe('examples'); - expect(segment.children).toHaveLength(0); - expect(segment.content).toBe(content); + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('examples'); + expect(ast.children).toHaveLength(0); + expect(ast.content).toBe(content); }); test('text in text', () => { const content = `This is a text with nested text content.`; - const segment = createSegments(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.content).toBe(content); - expect(segment.children).toHaveLength(0); + const ast = parseAST(content); + expect(ast.kind).toBe('TEXT'); + expect(ast.content).toBe(content); + expect(ast.children).toHaveLength(0); }); test('text in text in POML', () => { const content = `This is a text with nested text content.`; - const segment = createSegments(content); - expect(segment.kind).toBe('POML'); - expect(segment.tagName).toBe('poml'); - expect(segment.children).toHaveLength(1); - const textSegment = segment.children[0]; - expect(textSegment.kind).toBe('TEXT'); - expect(textSegment.content).toBe('This is a text with nested text content.'); + const ast = parseAST(content); + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('poml'); + expect(ast.children).toHaveLength(1); + const textNode = ast.children[0]; + expect(textNode.kind).toBe('TEXT'); + expect(textNode.content).toBe('This is a text with nested text content.'); }); test('nested same tag in POML', () => { const content = `Process data with nested task content.`; - const segment = createSegments(content); - expect(segment.kind).toBe('POML'); - expect(segment.tagName).toBe('poml'); - expect(segment.children).toHaveLength(0); - expect(segment.content).toBe('Process data with nested task content.'); + const ast = parseAST(content); + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('task'); + expect(ast.children).toHaveLength(0); + expect(ast.content).toBe('Process data with nested task content.'); }); test('text tag with nested content', () => { @@ -121,19 +121,19 @@ Here are some key points to consider: Remember to check the format `; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('POML'); - expect(segment.tagName).toBe('poml'); - expect(segment.children).toHaveLength(4); + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('poml'); + expect(ast.children).toHaveLength(4); - const textSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'text'); - expect(textSegment).toBeDefined(); - expect(textSegment!.children).toHaveLength(3); + const textNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'text'); + expect(textNode).toBeDefined(); + expect(textNode!.children).toHaveLength(3); - const nestedCpSegment = textSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); - expect(nestedCpSegment).toBeDefined(); - expect(nestedCpSegment!.content).toBe('This is a nested POML component that will be processed as POML.'); + const nestedCpNode = textNode!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); + expect(nestedCpNode).toBeDefined(); + expect(nestedCpNode!.content).toBe('This is a nested POML component that will be processed as POML.'); }); test('meta tags', () => { @@ -145,18 +145,18 @@ Here are some key points to consider: Complete the analysis`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(3); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(3); - const metaSegment = segment.children.find(c => c.kind === 'META'); - expect(metaSegment).toBeDefined(); - expect(metaSegment!.tagName).toBe('meta'); - expect(metaSegment!.children).toHaveLength(0); + const metaNode = ast.children.find(c => c.kind === 'META'); + expect(metaNode).toBeDefined(); + expect(metaNode!.tagName).toBe('meta'); + expect(metaNode!.children).toHaveLength(0); - const taskSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'task'); - expect(taskSegment).toBeDefined(); + const taskNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'task'); + expect(taskNode).toBeDefined(); }); test('invalid tags are ignored', () => { @@ -164,19 +164,19 @@ Here are some key points to consider: This should be processed This should also be ignored`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(3); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(3); - const taskSegment = segment.children.find(c => c.kind === 'POML'); - expect(taskSegment).toBeDefined(); - expect(taskSegment!.tagName).toBe('task'); + const taskNode = ast.children.find(c => c.kind === 'POML'); + expect(taskNode).toBeDefined(); + expect(taskNode!.tagName).toBe('task'); - const textSegments = segment.children.filter(c => c.kind === 'TEXT'); - expect(textSegments).toHaveLength(2); - expect(textSegments[0].content).toContain('This should be ignored'); - expect(textSegments[1].content).toContain('This should also be ignored'); + const textNodes = ast.children.filter(c => c.kind === 'TEXT'); + expect(textNodes).toHaveLength(2); + expect(textNodes[0].content).toContain('This should be ignored'); + expect(textNodes[1].content).toContain('This should also be ignored'); }); test('self-closing tags are ignored', () => { @@ -185,15 +185,15 @@ Here are some key points to consider: Valid hint`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(4); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(4); - const pomlSegments = segment.children.filter(c => c.kind === 'POML'); - expect(pomlSegments).toHaveLength(3); - expect(pomlSegments[0].tagName).toBe('task'); - expect(pomlSegments[2].tagName).toBe('hint'); + const pomlNodes = ast.children.filter(c => c.kind === 'POML'); + expect(pomlNodes).toHaveLength(3); + expect(pomlNodes[0].tagName).toBe('task'); + expect(pomlNodes[2].tagName).toBe('hint'); }); test('malformed tags are handled gracefully', () => { @@ -201,42 +201,45 @@ Here are some key points to consider: Complete hint This has no closing tag`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(3); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(3); - const hintSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); - expect(hintSegment).toBeDefined(); - expect(hintSegment!.content).toBe('Complete hint'); + const hintNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); + expect(hintNode).toBeDefined(); + expect(hintNode!.content).toBe('Complete hint'); - const textSegments = segment.children.filter(c => c.kind === 'TEXT'); - expect(textSegments).toHaveLength(2); - expect(textSegments[0].content).toBe('Incomplete tag\n'); - expect(textSegments[1].content).toBe('\nThis has no closing tag'); + const textNodes = ast.children.filter(c => c.kind === 'TEXT'); + expect(textNodes).toHaveLength(2); + expect(textNodes[0].content).toBe('Incomplete tag\n'); + expect(textNodes[1].content).toBe('\nThis has no closing tag'); }); test('malformed POML tags are ignored', () => { const content = `Valid task`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(0); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(0); }); test('empty content', () => { const content = ''; - const segment = createSegments(content); + const ast = parseAST(content); + expect(ast.kind).toBe('TEXT'); + expect(ast.content).toBe(''); + expect(ast.children).toHaveLength(0); }); test('whitespace-only content', () => { const content = ' \n\n\t \n '; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.content).toBe(content); - expect(segment.children).toHaveLength(0); + expect(ast.kind).toBe('TEXT'); + expect(ast.content).toBe(content); + expect(ast.children).toHaveLength(0); }); test('hyphenated tag names', () => { @@ -244,16 +247,16 @@ Here are some key points to consider: System message User message`; - const segment = createSegments(content); + const ast = parseAST(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(4); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(4); - const pomlSegments = segment.children.filter(c => c.kind === 'POML'); - expect(pomlSegments).toHaveLength(3); - expect(pomlSegments[0].tagName).toBe('output-format'); - expect(pomlSegments[1].tagName).toBe('system-msg'); - expect(pomlSegments[2].tagName).toBe('user-msg'); + const pomlNodes = ast.children.filter(c => c.kind === 'POML'); + expect(pomlNodes).toHaveLength(3); + expect(pomlNodes[0].tagName).toBe('output-format'); + expect(pomlNodes[1].tagName).toBe('system-msg'); + expect(pomlNodes[2].tagName).toBe('user-msg'); }); test('parent-child relationships', () => { @@ -265,59 +268,50 @@ Here are some key points to consider: `; - const segment = createSegments(content); + const ast = parseAST(content); - const taskSegment = segment; - expect(taskSegment.kind).toBe('POML'); - expect(taskSegment.tagName).toBe('task'); - expect(taskSegment.parent).toBeUndefined(); + const taskNode = ast; + expect(taskNode.kind).toBe('POML'); + expect(taskNode.tagName).toBe('task'); + expect(taskNode.parent).toBeUndefined(); - const hintSegment = taskSegment.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); - expect(hintSegment).toBeDefined(); - expect(hintSegment!.parent).toBe(taskSegment); + const hintNode = taskNode.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); + expect(hintNode).toBeDefined(); + expect(hintNode!.parent).toBe(taskNode); - const examplesSegment = taskSegment.children.find(c => c.kind === 'POML' && c.tagName === 'examples'); - expect(examplesSegment).toBeDefined(); - expect(examplesSegment!.parent).toBe(taskSegment); + const examplesNode = taskNode.children.find(c => c.kind === 'POML' && c.tagName === 'examples'); + expect(examplesNode).toBeDefined(); + expect(examplesNode!.parent).toBe(taskNode); - const exampleSegment = examplesSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'example'); - expect(exampleSegment).toBeDefined(); - expect(exampleSegment!.parent).toBe(examplesSegment); + const exampleNode = examplesNode!.children.find(c => c.kind === 'POML' && c.tagName === 'example'); + expect(exampleNode).toBeDefined(); + expect(exampleNode!.parent).toBe(examplesNode); }); - test('segment IDs are unique', () => { + test('node IDs are unique', () => { const content = `First task Second task A hint`; - const segment = createSegments(content); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(5); + const ast = parseAST(content); + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(5); - function collectAllSegments(segment: Segment): Segment[] { - const all = [segment]; - segment.children.forEach(child => { - all.push(...collectAllSegments(child)); + function collectAllNodes(node: ASTNode): ASTNode[] { + const all = [node]; + node.children.forEach(child => { + all.push(...collectAllNodes(child)); }); return all; } - const allSegments = collectAllSegments(segment); - const ids = allSegments.map(s => s.id); + const allNodes = collectAllNodes(ast); + const ids = allNodes.map(s => s.id); const uniqueIds = new Set(ids); expect(uniqueIds.size).toBe(ids.length); }); - test('path parameter is preserved', () => { - const content = 'Test task'; - const path = '/test/path/file.poml'; - const segment = createSegments(content, path); - - expect(segment.path).toBe(path); - expect(segment.children[0].path).toBe(path); - }); - test('complex example from specification', () => { const content = ` Process the following data @@ -343,30 +337,85 @@ There can be some intervening text here as well.

POML elements do not necessarily reside in a poml element.

`; - const segment = createSegments(content); + const ast = parseAST(content); + + expect(ast.kind).toBe('TEXT'); + expect(ast.children).toHaveLength(5); - expect(segment.kind).toBe('TEXT'); - expect(segment.children).toHaveLength(5); + const firstPomlNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'poml'); + expect(firstPomlNode).toBeDefined(); + expect(firstPomlNode!.children).toHaveLength(4); - const firstPomlSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'poml'); - expect(firstPomlSegment).toBeDefined(); - expect(firstPomlSegment!.children).toHaveLength(4); + const textNode = firstPomlNode!.children.find(c => c.kind === 'POML' && c.tagName === 'text'); + expect(textNode).toBeDefined(); + expect(textNode!.children).toHaveLength(3); + + const cpNode = textNode!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); + expect(cpNode).toBeDefined(); + + const secondPomlNode = ast.children.filter(c => c.kind === 'POML' && c.tagName === 'poml')[1]; + expect(secondPomlNode).toBeDefined(); + + const lineBreakNode = ast.children[3]; + expect(lineBreakNode.kind).toBe('TEXT'); + expect(lineBreakNode.content).toBe('\n\n'); + + const pNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'p'); + expect(pNode).toBeDefined(); + }); + + test('template variables in content', () => { + const content = `Process {{variable}} with {{another_variable}}`; + const ast = parseAST(content); - const textSegment = firstPomlSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'text'); - expect(textSegment).toBeDefined(); - expect(textSegment!.children).toHaveLength(3); + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('task'); + expect(ast.children).toHaveLength(4); // text, template, text, template - const cpSegment = textSegment!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); - expect(cpSegment).toBeDefined(); + const templateNodes = ast.children.filter(c => c.kind === 'TEMPLATE'); + expect(templateNodes).toHaveLength(2); + expect(templateNodes[0].expression).toBe('variable'); + expect(templateNodes[1].expression).toBe('another_variable'); + }); + + test('template variables in text nodes are treated as literal', () => { + const content = `Variables like {{this}} are shown as-is`; + const ast = parseAST(content); - const secondPomlSegment = segment.children.filter(c => c.kind === 'POML' && c.tagName === 'poml')[1]; - expect(secondPomlSegment).toBeDefined(); + expect(ast.kind).toBe('TEXT'); + expect(ast.content).toBe(content); + expect(ast.children).toHaveLength(0); + }); - const lineBreakSegment = segment.children[3]; - expect(lineBreakSegment.kind).toBe('TEXT'); - expect(lineBreakSegment.content).toBe('\n\n'); + test('template variables in attribute values', () => { + const content = `Content`; + const ast = parseAST(content); + + expect(ast.kind).toBe('POML'); + expect(ast.tagName).toBe('task'); + expect(ast.attributes).toHaveLength(1); + + const attr = ast.attributes![0]; + expect(attr.key).toBe('caption'); + expect(attr.value).toHaveLength(2); // text + template + expect(attr.value[0].kind).toBe('TEXT'); + expect(attr.value[0].content).toBe('Process '); + expect(attr.value[1].kind).toBe('TEMPLATE'); + expect(attr.value[1].expression).toBe('variable'); + }); - const pSegment = segment.children.find(c => c.kind === 'POML' && c.tagName === 'p'); - expect(pSegment).toBeDefined(); + test('mixed template variables and text in attributes', () => { + const content = `Content`; + const ast = parseAST(content); + + expect(ast.kind).toBe('POML'); + expect(ast.attributes).toHaveLength(1); + + const attr = ast.attributes![0]; + expect(attr.value).toHaveLength(4); // text, template, text, template + expect(attr.value[0].content).toBe('Hello '); + expect(attr.value[1].expression).toBe('name'); + expect(attr.value[2].content).toBe(', process '); + expect(attr.value[3].expression).toBe('data'); }); }); \ No newline at end of file From dffbc805b7760c4b0263bd3400549459a13dde6c Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 15 Jul 2025 11:36:17 +0800 Subject: [PATCH 08/76] . --- packages/poml/reader/ast.ts | 145 +----------------- packages/poml/reader/tokenizer.ts | 142 +++++++++++++++++ .../{segment.test.ts => reader/ast.test.ts} | 2 +- 3 files changed, 144 insertions(+), 145 deletions(-) create mode 100644 packages/poml/reader/tokenizer.ts rename packages/poml/tests/{segment.test.ts => reader/ast.test.ts} (99%) diff --git a/packages/poml/reader/ast.ts b/packages/poml/reader/ast.ts index 4ec9fb2e..3b758e1f 100644 --- a/packages/poml/reader/ast.ts +++ b/packages/poml/reader/ast.ts @@ -1,3 +1,4 @@ +import { Tokenizer, Token } from './tokenizer'; import componentDocs from '../assets/componentDocs.json'; // Source position and attribute interfaces @@ -50,150 +51,6 @@ export interface ASTNode { expression?: string; // The full expression content between {{}} } -// Token types for tokenization -interface Token { - type: 'TEXT' | 'TAG_OPEN' | 'TAG_CLOSE' | 'TAG_SELF_CLOSE' | 'TEMPLATE_VAR' | 'ATTRIBUTE'; - value: string; - start: number; - end: number; -} - -// Tokenizer class -class Tokenizer { - private input: string; - private position: number; - - constructor(input: string) { - this.input = input; - this.position = 0; - } - - tokenize(): Token[] { - const tokens: Token[] = []; - - while (this.position < this.input.length) { - // Check for template variables first - if (this.peek() === '{' && this.peek(1) === '{') { - tokens.push(this.readTemplateVariable()); - continue; - } - - // Check for XML tags - if (this.peek() === '<') { - const tagToken = this.readTag(); - if (tagToken) { - tokens.push(tagToken); - continue; - } - } - - // Read text content - const textToken = this.readText(); - if (textToken.value.length > 0) { - tokens.push(textToken); - } - } - - return tokens; - } - - private peek(offset: number = 0): string { - return this.input[this.position + offset] || ''; - } - - private advance(): string { - return this.input[this.position++] || ''; - } - - private readTemplateVariable(): Token { - const start = this.position; - this.advance(); // { - this.advance(); // { - - while (this.position < this.input.length && !(this.peek() === '}' && this.peek(1) === '}')) { - this.advance(); - } - - if (this.peek() === '}' && this.peek(1) === '}') { - this.advance(); // } - this.advance(); // } - } - - return { - type: 'TEMPLATE_VAR', - value: this.input.substring(start, this.position), - start, - end: this.position - }; - } - - private readTag(): Token | null { - const start = this.position; - this.advance(); // < - - // Skip whitespace - while (this.peek() === ' ' || this.peek() === '\t' || this.peek() === '\n') { - this.advance(); - } - - // Check for closing tag - const isClosing = this.peek() === '/'; - if (isClosing) { - this.advance(); - } - - // Read tag name - let tagName = ''; - while (this.position < this.input.length && - this.peek() !== '>' && - this.peek() !== ' ' && - this.peek() !== '\t' && - this.peek() !== '\n') { - tagName += this.advance(); - } - - // Skip attributes for now (will be parsed separately) - while (this.position < this.input.length && this.peek() !== '>') { - this.advance(); - } - - if (this.peek() === '>') { - this.advance(); // > - - // Check if self-closing - const content = this.input.substring(start, this.position); - const isSelfClosing = content.endsWith('/>'); - - return { - type: isSelfClosing ? 'TAG_SELF_CLOSE' : (isClosing ? 'TAG_CLOSE' : 'TAG_OPEN'), - value: content, - start, - end: this.position - }; - } - - // Invalid tag, backtrack - this.position = start + 1; - return null; - } - - private readText(): Token { - const start = this.position; - - while (this.position < this.input.length && - this.peek() !== '<' && - !(this.peek() === '{' && this.peek(1) === '{')) { - this.advance(); - } - - return { - type: 'TEXT', - value: this.input.substring(start, this.position), - start, - end: this.position - }; - } -} // AST Parser class class ASTParser { diff --git a/packages/poml/reader/tokenizer.ts b/packages/poml/reader/tokenizer.ts new file mode 100644 index 00000000..a8e166d1 --- /dev/null +++ b/packages/poml/reader/tokenizer.ts @@ -0,0 +1,142 @@ +export interface Token { + type: 'TEXT' | 'TAG_OPEN' | 'TAG_CLOSE' | 'TAG_SELF_CLOSE' | 'TEMPLATE_VAR' | 'ATTRIBUTE'; + value: string; + start: number; + end: number; +} + +export class Tokenizer { + private input: string; + private position: number; + + constructor(input: string) { + this.input = input; + this.position = 0; + } + + tokenize(): Token[] { + const tokens: Token[] = []; + + while (this.position < this.input.length) { + // Check for template variables first + if (this.peek() === '{' && this.peek(1) === '{') { + tokens.push(this.readTemplateVariable()); + continue; + } + + // Check for XML tags + if (this.peek() === '<') { + const tagToken = this.readTag(); + if (tagToken) { + tokens.push(tagToken); + continue; + } + } + + // Read text content + const textToken = this.readText(); + if (textToken.value.length > 0) { + tokens.push(textToken); + } + } + + return tokens; + } + + private peek(offset: number = 0): string { + return this.input[this.position + offset] || ''; + } + + private advance(): string { + return this.input[this.position++] || ''; + } + + private readTemplateVariable(): Token { + const start = this.position; + this.advance(); // { + this.advance(); // { + + while (this.position < this.input.length && !(this.peek() === '}' && this.peek(1) === '}')) { + this.advance(); + } + + if (this.peek() === '}' && this.peek(1) === '}') { + this.advance(); // } + this.advance(); // } + } + + return { + type: 'TEMPLATE_VAR', + value: this.input.substring(start, this.position), + start, + end: this.position + }; + } + + private readTag(): Token | null { + const start = this.position; + this.advance(); // < + + // Skip whitespace + while (this.peek() === ' ' || this.peek() === '\t' || this.peek() === '\n') { + this.advance(); + } + + // Check for closing tag + const isClosing = this.peek() === '/'; + if (isClosing) { + this.advance(); + } + + // Read tag name + let tagName = ''; + while (this.position < this.input.length && + this.peek() !== '>' && + this.peek() !== ' ' && + this.peek() !== '\t' && + this.peek() !== '\n') { + tagName += this.advance(); + } + + // Skip attributes for now (will be parsed separately) + while (this.position < this.input.length && this.peek() !== '>') { + this.advance(); + } + + if (this.peek() === '>') { + this.advance(); // > + + // Check if self-closing + const content = this.input.substring(start, this.position); + const isSelfClosing = content.endsWith('/>'); + + return { + type: isSelfClosing ? 'TAG_SELF_CLOSE' : (isClosing ? 'TAG_CLOSE' : 'TAG_OPEN'), + value: content, + start, + end: this.position + }; + } + + // Invalid tag, backtrack + this.position = start + 1; + return null; + } + + private readText(): Token { + const start = this.position; + + while (this.position < this.input.length && + this.peek() !== '<' && + !(this.peek() === '{' && this.peek(1) === '{')) { + this.advance(); + } + + return { + type: 'TEXT', + value: this.input.substring(start, this.position), + start, + end: this.position + }; + } +} diff --git a/packages/poml/tests/segment.test.ts b/packages/poml/tests/reader/ast.test.ts similarity index 99% rename from packages/poml/tests/segment.test.ts rename to packages/poml/tests/reader/ast.test.ts index b23e16ef..9921e210 100644 --- a/packages/poml/tests/segment.test.ts +++ b/packages/poml/tests/reader/ast.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from '@jest/globals'; -import { parseAST, ASTNode } from '../reader/ast'; +import { parseAST, ASTNode } from 'poml/reader/ast'; describe('parseAST', () => { test('pure text content', () => { From 5fe7dd3f424ee3b043c3ba0229ba2755ad728ce3 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 15 Jul 2025 14:23:26 +0800 Subject: [PATCH 09/76] Update poml_extended.md --- docs/proposals/poml_extended.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/proposals/poml_extended.md b/docs/proposals/poml_extended.md index 4f6d3e11..8564b681 100644 --- a/docs/proposals/poml_extended.md +++ b/docs/proposals/poml_extended.md @@ -17,7 +17,7 @@ The current POML implementation requires files to be fully enclosed within ` +4. **Controlled Evolution of Tags**: behaviour of new/experimental tags is opt‑in via ``, preventing accidental breakage when upgrading the tool‑chain. ## File Format Specification @@ -36,6 +36,7 @@ The system will assume the whole file is a pure text file and detects certain pa 1. Loading component definitions from `componentDocs.json` and extracting valid POML component names and their aliases. 2. Scanning for opening tags that match these components, and scanning until the corresponding closing tag is found. 3. If a special tag `...` is found within a POML segment, it will be treated as pure text content and processed following the rules above (step 1 and 2). +4. Unknown or disabled tags are treated as literal text and, by default, raise a diagnostic warning. An example is shown below: @@ -103,6 +104,14 @@ There can be some intervening text here as well. Metadatas are information that is useful when parsing and rendering the file, such as context variables, stylesheets, version information, file paths, etc. File-level metadata can be included at any place of the file in a special `` tag. This metadata will be processed before any content parsing. +**Example:** + +```xml + + +``` + ## Architecture Design ### High-level Processing Pipeline From 4a970a53d497a8cb08a79713caa19c79d0da2bc3 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 15 Jul 2025 17:58:00 +0800 Subject: [PATCH 10/76] Add cst --- packages/poml/reader/cst.ts | 233 ++++++++++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 packages/poml/reader/cst.ts diff --git a/packages/poml/reader/cst.ts b/packages/poml/reader/cst.ts new file mode 100644 index 00000000..43265cd9 --- /dev/null +++ b/packages/poml/reader/cst.ts @@ -0,0 +1,233 @@ +/* + Extended‑POML Lexer & CST Parser (Chevrotain) + ------------------------------------------------ + • Implements a two‑phase scanning strategy (lex + parse) for the mixed‑content + POML format described in the design spec. + • Produces a Concrete‑Syntax‑Tree (CST) that preserves the complete source + structure – suitable for later AST conversion, code‑intel, or pretty‑printing. + + Author: ChatGPT (o3) · Jul 15 2025 +*/ + +import { + createToken, + Lexer, + CstParser, + IToken, + CstNode, + tokenMatcher, + EmbeddedActionsParser +} from "chevrotain"; + +/*───────────────────────────────────────────────────────────────────────────┐ +│ 1. Token Definitions │ +└───────────────────────────────────────────────────────────────────────────*/ +// Helpers ----------------------------------------------------------------- +const makeRegexSafe = (re: RegExp) => { + return new RegExp(re.source, re.flags); +}; + +/** Matches valid XML / POML element or attribute names. */ +const nameRegex = /[A-Za-z_](?:[A-Za-z0-9_.-]*)/; +/** Rejects names that start with "xml" (case‑insensitive). */ +function validName(text: string) { + return !/^xml/i.test(text); +} + +/* Longest tokens first – Chevrotain uses sequential matching order. + Pay attention to shared prefixes like " (greedy, including line‑breaks) +export const Comment = createToken({ + name: "Comment", + pattern: //, + line_breaks: true +}); + +// Template delimiters {{ ... }} ------------------------------------------- +export const TmplStart = createToken({ name: "TmplStart", pattern: /{{/ }); +export const TmplEnd = createToken({ name: "TmplEnd", pattern: /}}/ }); +export const TmplBody = createToken({ + name: "TmplBody", + pattern: /[^{}]+/, + // will be pushed onto the stack between {{ ... }} + line_breaks: true +}); + +// Tag delimiters ----------------------------------------------------------- +export const CloseTagStart = createToken({ + name: "CloseTagStart", + pattern: /<\// +}); +export const SelfClose = createToken({ name: "SelfClose", pattern: /\/>/ }); +export const OpenTagStart = createToken({ name: "OpenTagStart", pattern: // }); + +// Misc tokens -------------------------------------------------------------- +export const Equals = createToken({ name: "Equals", pattern: /=/ }); +export const Quote = createToken({ name: "Quote", pattern: /"/ }); + +// Identifiers (tag & attribute names) ------------------------------------- +export const Identifier = createToken({ + name: "Identifier", + pattern: nameRegex, + line_breaks: false, + longer_alt: undefined, + // custom validator to reject names starting with XML + // Chevrotain v11 supports "validate" callback – fallback to pattern check +}); + +// Attribute value – everything inside double quotes (lazy, allows linebreaks) +export const AttrText = createToken({ + name: "AttrText", + pattern: /[^\"]+/, + line_breaks: true +}); + +// Raw text between tags – stop at the first "<" or "{{" +export const RawText = createToken({ + name: "RawText", + pattern: /[^<{]+/, + line_breaks: true +}); + +// Whitespace (skipped) +export const WS = createToken({ + name: "WS", + pattern: /[ \t\r\n]+/, + group: Lexer.SKIPPED +}); + +export const allTokens = [ + // order matters! + Comment, + TmplStart, + TmplEnd, + CloseTagStart, + SelfClose, + OpenTagStart, + GT, + Equals, + Quote, + Identifier, + TmplBody, // must come after Identifier so {{name}} splits correctly + AttrText, + RawText, + WS +]; + +export const PomlLexer = new Lexer(allTokens, { + positionTracking: "full" +}); + +/*───────────────────────────────────────────────────────────────────────────┐ +│ 2. CST Parser │ +└───────────────────────────────────────────────────────────────────────────*/ +class PomlCstParser extends CstParser { + constructor() { + super(allTokens, { recoveryEnabled: true }); + + const $ = this; + + $.RULE("document", () => { + $.MANY(() => { + $.SUBRULE($.content); + }); + }); + + $.RULE("content", () => { + $.OR([ + { ALT: () => $.SUBRULE($.element) }, + { ALT: () => $.SUBRULE($.template) }, + { ALT: () => $.CONSUME(RawText) }, + { ALT: () => $.CONSUME(Comment) } + ]); + }); + + // content* | + $.RULE("element", () => { + $.CONSUME(OpenTagStart); + const nameToken = $.CONSUME(Identifier); + + $.MANY(() => { + $.SUBRULE($.attribute); + }); + + $.OR([ + { ALT: () => { + $.CONSUME(SelfClose); + } + }, + { ALT: () => { + $.CONSUME(GT); + $.MANY2(() => { + $.SUBRULE2($.content); + }); + $.CONSUME(CloseTagStart); + $.CONSUME2(Identifier, { LABEL: "closingName" }); + $.CONSUME2(GT); + } + } + ]); + }); + + // attrName = "value" + $.RULE("attribute", () => { + $.CONSUME(Identifier, { LABEL: "attrName" }); + $.CONSUME(Equals); + $.CONSUME(Quote); + $.MANY(() => { + $.OR([ + { ALT: () => $.SUBRULE($.template) }, + { ALT: () => $.CONSUME(AttrText) } + ]); + }); + $.CONSUME2(Quote); + }); + + // {{ expression }} + $.RULE("template", () => { + $.CONSUME(TmplStart); + $.CONSUME(TmplBody, { LABEL: "expr" }); + $.CONSUME(TmplEnd); + }); + + this.performSelfAnalysis(); + } +} + +export const parser = new PomlCstParser(); + +/*───────────────────────────────────────────────────────────────────────────┐ +│ 3. Convenience API │ +└───────────────────────────────────────────────────────────────────────────*/ +export interface ParseResult { + cst: CstNode | undefined; + lexErrors: any[]; + parseErrors: any[]; +} + +/** + * Parses a given POML / mixed‑content string and returns the CST & diagnostics. + */ +export function parsePoml(input: string): ParseResult { + const lexResult = PomlLexer.tokenize(input); + parser.input = lexResult.tokens; + const cst = parser.document(); + + return { + cst, + lexErrors: lexResult.errors, + parseErrors: parser.errors + }; +} + +/*───────────────────────────────────────────────────────────────────────────┐ +│ 4. Quick demo │ +└───────────────────────────────────────────────────────────────────────────*/ +if (require.main === module) { + const sample = `\n# Hello\nDo something\n`; // eslint‑disable‑line no-console + const result = parsePoml(sample); + console.log("Lex errors:", result.lexErrors); + console.log("Parse errors:", result.parseErrors); + console.dir(result.cst, { depth: 10, colors: true }); +} From f77b2e052dce6bca013d542abaef4191d523dc73 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 16 Jul 2025 20:24:32 +0800 Subject: [PATCH 11/76] Add lexer --- packages/poml/reader/cst.ts | 233 --------- packages/poml/reader/lexer.ts | 92 ++++ packages/poml/tests/reader/lexer.test.ts | 633 +++++++++++++++++++++++ 3 files changed, 725 insertions(+), 233 deletions(-) create mode 100644 packages/poml/reader/lexer.ts create mode 100644 packages/poml/tests/reader/lexer.test.ts diff --git a/packages/poml/reader/cst.ts b/packages/poml/reader/cst.ts index 43265cd9..e69de29b 100644 --- a/packages/poml/reader/cst.ts +++ b/packages/poml/reader/cst.ts @@ -1,233 +0,0 @@ -/* - Extended‑POML Lexer & CST Parser (Chevrotain) - ------------------------------------------------ - • Implements a two‑phase scanning strategy (lex + parse) for the mixed‑content - POML format described in the design spec. - • Produces a Concrete‑Syntax‑Tree (CST) that preserves the complete source - structure – suitable for later AST conversion, code‑intel, or pretty‑printing. - - Author: ChatGPT (o3) · Jul 15 2025 -*/ - -import { - createToken, - Lexer, - CstParser, - IToken, - CstNode, - tokenMatcher, - EmbeddedActionsParser -} from "chevrotain"; - -/*───────────────────────────────────────────────────────────────────────────┐ -│ 1. Token Definitions │ -└───────────────────────────────────────────────────────────────────────────*/ -// Helpers ----------------------------------------------------------------- -const makeRegexSafe = (re: RegExp) => { - return new RegExp(re.source, re.flags); -}; - -/** Matches valid XML / POML element or attribute names. */ -const nameRegex = /[A-Za-z_](?:[A-Za-z0-9_.-]*)/; -/** Rejects names that start with "xml" (case‑insensitive). */ -function validName(text: string) { - return !/^xml/i.test(text); -} - -/* Longest tokens first – Chevrotain uses sequential matching order. - Pay attention to shared prefixes like " (greedy, including line‑breaks) -export const Comment = createToken({ - name: "Comment", - pattern: //, - line_breaks: true -}); - -// Template delimiters {{ ... }} ------------------------------------------- -export const TmplStart = createToken({ name: "TmplStart", pattern: /{{/ }); -export const TmplEnd = createToken({ name: "TmplEnd", pattern: /}}/ }); -export const TmplBody = createToken({ - name: "TmplBody", - pattern: /[^{}]+/, - // will be pushed onto the stack between {{ ... }} - line_breaks: true -}); - -// Tag delimiters ----------------------------------------------------------- -export const CloseTagStart = createToken({ - name: "CloseTagStart", - pattern: /<\// -}); -export const SelfClose = createToken({ name: "SelfClose", pattern: /\/>/ }); -export const OpenTagStart = createToken({ name: "OpenTagStart", pattern: // }); - -// Misc tokens -------------------------------------------------------------- -export const Equals = createToken({ name: "Equals", pattern: /=/ }); -export const Quote = createToken({ name: "Quote", pattern: /"/ }); - -// Identifiers (tag & attribute names) ------------------------------------- -export const Identifier = createToken({ - name: "Identifier", - pattern: nameRegex, - line_breaks: false, - longer_alt: undefined, - // custom validator to reject names starting with XML - // Chevrotain v11 supports "validate" callback – fallback to pattern check -}); - -// Attribute value – everything inside double quotes (lazy, allows linebreaks) -export const AttrText = createToken({ - name: "AttrText", - pattern: /[^\"]+/, - line_breaks: true -}); - -// Raw text between tags – stop at the first "<" or "{{" -export const RawText = createToken({ - name: "RawText", - pattern: /[^<{]+/, - line_breaks: true -}); - -// Whitespace (skipped) -export const WS = createToken({ - name: "WS", - pattern: /[ \t\r\n]+/, - group: Lexer.SKIPPED -}); - -export const allTokens = [ - // order matters! - Comment, - TmplStart, - TmplEnd, - CloseTagStart, - SelfClose, - OpenTagStart, - GT, - Equals, - Quote, - Identifier, - TmplBody, // must come after Identifier so {{name}} splits correctly - AttrText, - RawText, - WS -]; - -export const PomlLexer = new Lexer(allTokens, { - positionTracking: "full" -}); - -/*───────────────────────────────────────────────────────────────────────────┐ -│ 2. CST Parser │ -└───────────────────────────────────────────────────────────────────────────*/ -class PomlCstParser extends CstParser { - constructor() { - super(allTokens, { recoveryEnabled: true }); - - const $ = this; - - $.RULE("document", () => { - $.MANY(() => { - $.SUBRULE($.content); - }); - }); - - $.RULE("content", () => { - $.OR([ - { ALT: () => $.SUBRULE($.element) }, - { ALT: () => $.SUBRULE($.template) }, - { ALT: () => $.CONSUME(RawText) }, - { ALT: () => $.CONSUME(Comment) } - ]); - }); - - // content* | - $.RULE("element", () => { - $.CONSUME(OpenTagStart); - const nameToken = $.CONSUME(Identifier); - - $.MANY(() => { - $.SUBRULE($.attribute); - }); - - $.OR([ - { ALT: () => { - $.CONSUME(SelfClose); - } - }, - { ALT: () => { - $.CONSUME(GT); - $.MANY2(() => { - $.SUBRULE2($.content); - }); - $.CONSUME(CloseTagStart); - $.CONSUME2(Identifier, { LABEL: "closingName" }); - $.CONSUME2(GT); - } - } - ]); - }); - - // attrName = "value" - $.RULE("attribute", () => { - $.CONSUME(Identifier, { LABEL: "attrName" }); - $.CONSUME(Equals); - $.CONSUME(Quote); - $.MANY(() => { - $.OR([ - { ALT: () => $.SUBRULE($.template) }, - { ALT: () => $.CONSUME(AttrText) } - ]); - }); - $.CONSUME2(Quote); - }); - - // {{ expression }} - $.RULE("template", () => { - $.CONSUME(TmplStart); - $.CONSUME(TmplBody, { LABEL: "expr" }); - $.CONSUME(TmplEnd); - }); - - this.performSelfAnalysis(); - } -} - -export const parser = new PomlCstParser(); - -/*───────────────────────────────────────────────────────────────────────────┐ -│ 3. Convenience API │ -└───────────────────────────────────────────────────────────────────────────*/ -export interface ParseResult { - cst: CstNode | undefined; - lexErrors: any[]; - parseErrors: any[]; -} - -/** - * Parses a given POML / mixed‑content string and returns the CST & diagnostics. - */ -export function parsePoml(input: string): ParseResult { - const lexResult = PomlLexer.tokenize(input); - parser.input = lexResult.tokens; - const cst = parser.document(); - - return { - cst, - lexErrors: lexResult.errors, - parseErrors: parser.errors - }; -} - -/*───────────────────────────────────────────────────────────────────────────┐ -│ 4. Quick demo │ -└───────────────────────────────────────────────────────────────────────────*/ -if (require.main === module) { - const sample = `\n# Hello\nDo something\n`; // eslint‑disable‑line no-console - const result = parsePoml(sample); - console.log("Lex errors:", result.lexErrors); - console.log("Parse errors:", result.parseErrors); - console.dir(result.cst, { depth: 10, colors: true }); -} diff --git a/packages/poml/reader/lexer.ts b/packages/poml/reader/lexer.ts new file mode 100644 index 00000000..292647ed --- /dev/null +++ b/packages/poml/reader/lexer.ts @@ -0,0 +1,92 @@ +import { createToken, Lexer } from 'chevrotain'; + +// Define token types for extended POML +export const Comment = createToken({ name: 'Comment', pattern: // }); +export const TemplateOpen = createToken({ name: 'TemplateOpen', pattern: /{{/ }); +export const TemplateClose = createToken({ name: 'TemplateClose', pattern: /}}/ }); +export const TagClosingOpen = createToken({ name: 'TagClosingOpen', pattern: /<\// }); +export const TagSelfClose = createToken({ name: 'TagSelfClose', pattern: /\/>/ }); +export const TagOpen = createToken({ name: 'TagOpen', pattern: // }); +export const Equals = createToken({ name: 'Equals', pattern: /=/ }); + +// Individual character tokens for quotes and backslash - CST parser will handle semantics +export const DoubleQuote = createToken({ name: 'DoubleQuote', pattern: /"/ }); +export const SingleQuote = createToken({ name: 'SingleQuote', pattern: /'/ }); +export const Backslash = createToken({ name: 'Backslash', pattern: /\\/ }); + +export const Identifier = createToken({ + name: 'Identifier', + pattern: /[a-zA-Z_][a-zA-Z0-9_-]*/ +}); + +export const Whitespace = createToken({ + name: 'Whitespace', + pattern: /[ \t\r\n]+/, + line_breaks: true +}); + +export const TemplateContent = createToken({ + name: 'TemplateContent', + pattern: /[^}]+/, + line_breaks: true +}); + +// Text content - should not consume quotes, backslashes, or tag/template delimiters +export const TextContent = createToken({ + name: 'TextContent', + pattern: /[^<{}"'\\]+/, + line_breaks: true +}); + +// Define token order - more specific patterns first +export const allTokens = [ + Comment, + TemplateOpen, + TemplateClose, + TagClosingOpen, // Must come before TagOpen + TagSelfClose, // Must come before TagClose + TagOpen, + TagClose, + Equals, + DoubleQuote, + SingleQuote, + Backslash, + Identifier, + Whitespace, + TemplateContent, + TextContent +]; + +// Extended POML Lexer class +export class ExtendedPomlLexer { + private lexer: Lexer; + + constructor() { + this.lexer = new Lexer(allTokens); + } + + public tokenize(text: string) { + const lexingResult = this.lexer.tokenize(text); + + if (lexingResult.errors.length > 0) { + console.warn('Lexing errors:', lexingResult.errors); + } + + return { + tokens: lexingResult.tokens, + errors: lexingResult.errors, + groups: lexingResult.groups + }; + } +} + +// Create a single instance to export +export const extendedPomlLexer = new ExtendedPomlLexer(); + +// Export token types for use in parser +export type { + IToken, + ILexingError, + ILexingResult +} from 'chevrotain'; \ No newline at end of file diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts new file mode 100644 index 00000000..8d5c0488 --- /dev/null +++ b/packages/poml/tests/reader/lexer.test.ts @@ -0,0 +1,633 @@ +import { describe, expect, test } from '@jest/globals'; +import { + extendedPomlLexer, + Comment, + TemplateOpen, + TemplateClose, + TagOpen, + TagClose, + TagClosingOpen, + TagSelfClose, + Equals, + DoubleQuote, + SingleQuote, + Backslash, + Identifier, + Whitespace, + TemplateContent, + TextContent +} from '../../reader/lexer'; + +describe('ExtendedPomlLexer', () => { + + describe('Comments', () => { + test('should tokenize HTML comments', () => { + const input = ''; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Comment); + expect(result.tokens[0].image).toBe(''); + }); + + test('should tokenize multiline comments', () => { + const input = ``; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Comment); + }); + + test('should tokenize comments with content after', () => { + const input = 'Some text'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(2); + expect(result.tokens[0].tokenType).toBe(Comment); + expect(result.tokens[1].tokenType).toBe(TextContent); + expect(result.tokens[1].image).toBe('Some text'); + }); + }); + + describe('Template Variables', () => { + test('should tokenize template variable delimiters', () => { + const input = '{{variable}}'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(3); + expect(result.tokens[0].tokenType).toBe(TemplateOpen); + expect(result.tokens[0].image).toBe('{{'); + expect(result.tokens[1].tokenType).toBe(TemplateContent); + expect(result.tokens[1].image).toBe('variable'); + expect(result.tokens[2].tokenType).toBe(TemplateClose); + expect(result.tokens[2].image).toBe('}}'); + }); + + test('should tokenize template variables with complex expressions', () => { + const input = '{{user.name || "Anonymous"}}'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(3); + expect(result.tokens[0].tokenType).toBe(TemplateOpen); + expect(result.tokens[1].tokenType).toBe(TemplateContent); + expect(result.tokens[1].image).toBe('user.name || "Anonymous"'); + expect(result.tokens[2].tokenType).toBe(TemplateClose); + }); + + test('should tokenize multiple template variables', () => { + const input = '{{first}} and {{second}}'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(6); + expect(result.tokens[0].tokenType).toBe(TemplateOpen); + expect(result.tokens[1].tokenType).toBe(TemplateContent); + expect(result.tokens[1].image).toBe('first'); + expect(result.tokens[2].tokenType).toBe(TemplateClose); + expect(result.tokens[3].tokenType).toBe(TextContent); + expect(result.tokens[3].image).toBe(' and '); + expect(result.tokens[4].tokenType).toBe(TemplateOpen); + expect(result.tokens[5].tokenType).toBe(TemplateContent); + }); + }); + + describe('XML Tags', () => { + test('should tokenize opening tags', () => { + const input = ''; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(3); + expect(result.tokens[0].tokenType).toBe(TagOpen); + expect(result.tokens[0].image).toBe('<'); + expect(result.tokens[1].tokenType).toBe(Identifier); + expect(result.tokens[1].image).toBe('task'); + expect(result.tokens[2].tokenType).toBe(TagClose); + expect(result.tokens[2].image).toBe('>'); + }); + + test('should tokenize closing tags', () => { + const input = ''; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(3); + expect(result.tokens[0].tokenType).toBe(TagClosingOpen); + expect(result.tokens[0].image).toBe(''); + }); + + test('should tokenize self-closing tags', () => { + const input = ''; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(3); + expect(result.tokens[0].tokenType).toBe(TagOpen); + expect(result.tokens[0].image).toBe('<'); + expect(result.tokens[1].tokenType).toBe(Identifier); + expect(result.tokens[1].image).toBe('meta'); + expect(result.tokens[2].tokenType).toBe(TagSelfClose); + expect(result.tokens[2].image).toBe('/>'); + }); + + test('should tokenize tags with attributes', () => { + const input = ''; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(TagOpen); + expect(tokenTypes).toContain(Identifier); + expect(tokenTypes).toContain(Equals); + expect(tokenTypes).toContain(DoubleQuote); + expect(tokenTypes).toContain(TagClose); + + // Verify specific tokens exist + expect(result.tokens[0].tokenType).toBe(TagOpen); + expect(result.tokens[0].image).toBe('<'); + + const identifierTokens = result.tokens.filter(t => t.tokenType === Identifier); + expect(identifierTokens.length).toBeGreaterThanOrEqual(3); // task, id, class + expect(identifierTokens[0].image).toBe('task'); + expect(identifierTokens[1].image).toBe('id'); + }); + }); + + describe('Quote and Escape Characters', () => { + test('should tokenize double quotes as individual tokens', () => { + const input = '"Hello world"'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(DoubleQuote); + expect(tokenTypes).toContain(TextContent); + + // First and last tokens should be quotes + expect(result.tokens[0].tokenType).toBe(DoubleQuote); + expect(result.tokens[0].image).toBe('"'); + expect(result.tokens[result.tokens.length - 1].tokenType).toBe(DoubleQuote); + expect(result.tokens[result.tokens.length - 1].image).toBe('"'); + }); + + test('should tokenize single quotes as individual tokens', () => { + const input = "'Hello world'"; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(SingleQuote); + expect(tokenTypes).toContain(TextContent); + + // First and last tokens should be quotes + expect(result.tokens[0].tokenType).toBe(SingleQuote); + expect(result.tokens[0].image).toBe("'"); + expect(result.tokens[result.tokens.length - 1].tokenType).toBe(SingleQuote); + expect(result.tokens[result.tokens.length - 1].image).toBe("'"); + }); + + test('should tokenize backslashes as individual tokens', () => { + const input = 'text\\with\\backslashes'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(Backslash); + expect(tokenTypes).toContain(TextContent); + + // Should have backslash tokens + const backslashTokens = result.tokens.filter(t => t.tokenType === Backslash); + expect(backslashTokens.length).toBe(2); + expect(backslashTokens[0].image).toBe('\\'); + expect(backslashTokens[1].image).toBe('\\'); + }); + + test('should handle mixed quotes and backslashes', () => { + const input = 'text "with \\"escaped\\" quotes"'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(DoubleQuote); + expect(tokenTypes).toContain(Backslash); + expect(tokenTypes).toContain(TextContent); + }); + }); + + describe('Identifiers', () => { + test('should tokenize simple identifiers', () => { + const input = 'task'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Identifier); + expect(result.tokens[0].image).toBe('task'); + }); + + test('should tokenize identifiers with hyphens', () => { + const input = 'my-component'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Identifier); + expect(result.tokens[0].image).toBe('my-component'); + }); + + test('should tokenize identifiers with underscores', () => { + const input = 'my_component'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Identifier); + expect(result.tokens[0].image).toBe('my_component'); + }); + + test('should tokenize identifiers with numbers', () => { + const input = 'component123'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Identifier); + expect(result.tokens[0].image).toBe('component123'); + }); + }); + + describe('Text Content', () => { + test('should tokenize plain text', () => { + const input = 'This is some plain text'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(TextContent); + expect(result.tokens[0].image).toBe('This is some plain text'); + }); + + test('should tokenize text with newlines', () => { + const input = `Line 1 +Line 2 +Line 3`; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(TextContent); + expect(result.tokens[0].image).toBe(`Line 1 +Line 2 +Line 3`); + }); + + test('should stop text content at tags', () => { + const input = 'Some text '; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(4); + expect(result.tokens[0].tokenType).toBe(TextContent); + expect(result.tokens[0].image).toBe('Some text '); + expect(result.tokens[1].tokenType).toBe(TagOpen); + expect(result.tokens[2].tokenType).toBe(Identifier); + expect(result.tokens[3].tokenType).toBe(TagClose); + }); + + test('should stop text content at template variables', () => { + const input = 'Some text {{variable}}'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(4); + expect(result.tokens[0].tokenType).toBe(TextContent); + expect(result.tokens[0].image).toBe('Some text '); + expect(result.tokens[1].tokenType).toBe(TemplateOpen); + expect(result.tokens[2].tokenType).toBe(TemplateContent); + expect(result.tokens[3].tokenType).toBe(TemplateClose); + }); + }); + + describe('Complex Mixed Content', () => { + test('should tokenize extended POML example from specification', () => { + const input = `# My Analysis Document + +This is a regular markdown document. + + + Analyze the following data and provide insights. + + +Here are some key points: +- Data quality +- Statistical significance + +{{variable_will_be_substituted}}`; + + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // Check for presence of different token types + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(TextContent); + expect(tokenTypes).toContain(TagOpen); + expect(tokenTypes).toContain(TagClose); + expect(tokenTypes).toContain(Identifier); + expect(tokenTypes).toContain(TemplateOpen); + expect(tokenTypes).toContain(TemplateClose); + expect(tokenTypes).toContain(TemplateContent); + }); + + test('should tokenize comments with tags and templates', () => { + const input = '{{content}}'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // First token should be comment + expect(result.tokens[0].tokenType).toBe(Comment); + }); + + test('should handle self-closing tags with attributes', () => { + const input = ''; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(TagOpen); + expect(tokenTypes).toContain(Identifier); + expect(tokenTypes).toContain(Equals); + expect(tokenTypes).toContain(DoubleQuote); + expect(tokenTypes).toContain(TemplateOpen); + expect(tokenTypes).toContain(TemplateClose); + expect(tokenTypes).toContain(TagSelfClose); + }); + + test('should handle the specific case: "abcdefghi"', () => { + const input = '"abcdefghi"'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // Should tokenize as: " abc < poml > def ghi " + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(DoubleQuote); + expect(tokenTypes).toContain(TextContent); + expect(tokenTypes).toContain(TagOpen); + expect(tokenTypes).toContain(Identifier); + expect(tokenTypes).toContain(TagClose); + expect(tokenTypes).toContain(TagClosingOpen); + + // First and last tokens should be quotes + expect(result.tokens[0].tokenType).toBe(DoubleQuote); + expect(result.tokens[0].image).toBe('"'); + expect(result.tokens[result.tokens.length - 1].tokenType).toBe(DoubleQuote); + expect(result.tokens[result.tokens.length - 1].image).toBe('"'); + }); + + test('should handle the attribute case: ghi', () => { + const input = 'ghi'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // Should tokenize as: < poml abc = " def " > ghi + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(TagOpen); + expect(tokenTypes).toContain(Identifier); + expect(tokenTypes).toContain(Equals); + expect(tokenTypes).toContain(DoubleQuote); + expect(tokenTypes).toContain(TextContent); + expect(tokenTypes).toContain(TagClose); + expect(tokenTypes).toContain(TagClosingOpen); + + // Verify the structure + expect(result.tokens[0].tokenType).toBe(TagOpen); + expect(result.tokens[0].image).toBe('<'); + + const identifierTokens = result.tokens.filter(t => t.tokenType === Identifier); + expect(identifierTokens.length).toBeGreaterThanOrEqual(3); // poml, abc, def, poml + expect(identifierTokens[0].image).toBe('poml'); + expect(identifierTokens[1].image).toBe('abc'); + }); + }); + + describe('Whitespace Handling', () => { + test('should preserve whitespace tokens', () => { + const input = ' \t\n \t\n \t\n '; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + + // Whitespace should be preserved + const tokenTypes = result.tokens.map(t => t.tokenType); + expect(tokenTypes).toContain(Whitespace); + expect(tokenTypes).toContain(TagOpen); + expect(tokenTypes).toContain(TagClose); + expect(tokenTypes).toContain(TagClosingOpen); + expect(tokenTypes).toContain(Identifier); + + // Should start with whitespace + expect(result.tokens[0].tokenType).toBe(Whitespace); + expect(result.tokens[0].image).toBe(' \t\n '); + }); + }); + + describe('Error Handling and Source Index Verification', () => { + test('should handle malformed input gracefully with correct source positions', () => { + const input = ' t.tokenType === TemplateOpen); + expect(templateToken).toBeDefined(); + if (templateToken!.startLine !== undefined) { + expect(templateToken!.startLine).toBe(3); + expect(templateToken!.startColumn).toBe(8); // After "line 3 " + } + }); + + test('should verify token boundaries do not overlap', () => { + const input = 'content'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // Sort tokens by start position + const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset - b.startOffset); + + // Verify no overlaps + for (let i = 0; i < sortedTokens.length - 1; i++) { + const current = sortedTokens[i]; + const next = sortedTokens[i + 1]; + expect(current.endOffset).toBeLessThanOrEqual(next.startOffset); + } + }); + + test('should handle special characters in text content', () => { + const input = 'text with @#$%^&*()[]{}|;:,.<>?/~`'; + const result = extendedPomlLexer.tokenize(input); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // Should tokenize as text content + const textTokens = result.tokens.filter(t => t.tokenType === TextContent); + expect(textTokens.length).toBeGreaterThan(0); + + // Verify positions are correct + for (const token of textTokens) { + expect(token.startOffset).toBeLessThan(input.length); + expect(token.endOffset).toBeLessThanOrEqual(input.length); + } + }); + }); +}); \ No newline at end of file From 106b1a591bcbbf15916645229c9ae91c9f97c890 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 16 Jul 2025 21:03:22 +0800 Subject: [PATCH 12/76] . --- packages/poml/tests/reader/lexer.test.ts | 693 ++++++----------------- 1 file changed, 173 insertions(+), 520 deletions(-) diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index 8d5c0488..a2107a49 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -18,616 +18,269 @@ import { TextContent } from '../../reader/lexer'; +// Helper function to extract token images +function tokenImages(input: string): string[] { + const result = extendedPomlLexer.tokenize(input); + return result.tokens.map(t => t.image); +} + +// Helper function to extract token types +function tokenTypes(input: string): any[] { + const result = extendedPomlLexer.tokenize(input); + return result.tokens.map(t => t.tokenType); +} + +// Helper function to get full tokenization result +function tokenize(input: string) { + return extendedPomlLexer.tokenize(input); +} + describe('ExtendedPomlLexer', () => { - describe('Comments', () => { + describe('Basic Token Images', () => { test('should tokenize HTML comments', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Comment); - expect(result.tokens[0].image).toBe(''); + expect(tokenImages('')).toEqual(['']); }); - test('should tokenize multiline comments', () => { - const input = ``; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Comment); + test('should tokenize template variables', () => { + expect(tokenImages('{{variable}}')).toEqual(['{{', 'variable', '}}']); }); - test('should tokenize comments with content after', () => { - const input = 'Some text'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(2); - expect(result.tokens[0].tokenType).toBe(Comment); - expect(result.tokens[1].tokenType).toBe(TextContent); - expect(result.tokens[1].image).toBe('Some text'); + test('should tokenize XML tags', () => { + expect(tokenImages('')).toEqual(['<', 'task', '>']); + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['<', 'meta', ' ', '/>']); }); - }); - describe('Template Variables', () => { - test('should tokenize template variable delimiters', () => { - const input = '{{variable}}'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(3); - expect(result.tokens[0].tokenType).toBe(TemplateOpen); - expect(result.tokens[0].image).toBe('{{'); - expect(result.tokens[1].tokenType).toBe(TemplateContent); - expect(result.tokens[1].image).toBe('variable'); - expect(result.tokens[2].tokenType).toBe(TemplateClose); - expect(result.tokens[2].image).toBe('}}'); - }); - - test('should tokenize template variables with complex expressions', () => { - const input = '{{user.name || "Anonymous"}}'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(3); - expect(result.tokens[0].tokenType).toBe(TemplateOpen); - expect(result.tokens[1].tokenType).toBe(TemplateContent); - expect(result.tokens[1].image).toBe('user.name || "Anonymous"'); - expect(result.tokens[2].tokenType).toBe(TemplateClose); + test('should tokenize quotes and backslashes individually', () => { + expect(tokenImages('"hello"')).toEqual(['"', 'hello', '"']); + expect(tokenImages("'world'")).toEqual(["'", 'world', "'"]); + expect(tokenImages('text\\escape')).toEqual(['text', '\\', 'escape']); }); - test('should tokenize multiple template variables', () => { - const input = '{{first}} and {{second}}'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(6); - expect(result.tokens[0].tokenType).toBe(TemplateOpen); - expect(result.tokens[1].tokenType).toBe(TemplateContent); - expect(result.tokens[1].image).toBe('first'); - expect(result.tokens[2].tokenType).toBe(TemplateClose); - expect(result.tokens[3].tokenType).toBe(TextContent); - expect(result.tokens[3].image).toBe(' and '); - expect(result.tokens[4].tokenType).toBe(TemplateOpen); - expect(result.tokens[5].tokenType).toBe(TemplateContent); + test('should tokenize attributes', () => { + expect(tokenImages('id="value"')).toEqual(['id', '=', '"', 'value', '"']); }); - }); - describe('XML Tags', () => { - test('should tokenize opening tags', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(3); - expect(result.tokens[0].tokenType).toBe(TagOpen); - expect(result.tokens[0].image).toBe('<'); - expect(result.tokens[1].tokenType).toBe(Identifier); - expect(result.tokens[1].image).toBe('task'); - expect(result.tokens[2].tokenType).toBe(TagClose); - expect(result.tokens[2].image).toBe('>'); - }); - - test('should tokenize closing tags', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(3); - expect(result.tokens[0].tokenType).toBe(TagClosingOpen); - expect(result.tokens[0].image).toBe(''); - }); - - test('should tokenize self-closing tags', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(3); - expect(result.tokens[0].tokenType).toBe(TagOpen); - expect(result.tokens[0].image).toBe('<'); - expect(result.tokens[1].tokenType).toBe(Identifier); - expect(result.tokens[1].image).toBe('meta'); - expect(result.tokens[2].tokenType).toBe(TagSelfClose); - expect(result.tokens[2].image).toBe('/>'); - }); - - test('should tokenize tags with attributes', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(TagOpen); - expect(tokenTypes).toContain(Identifier); - expect(tokenTypes).toContain(Equals); - expect(tokenTypes).toContain(DoubleQuote); - expect(tokenTypes).toContain(TagClose); - - // Verify specific tokens exist - expect(result.tokens[0].tokenType).toBe(TagOpen); - expect(result.tokens[0].image).toBe('<'); - - const identifierTokens = result.tokens.filter(t => t.tokenType === Identifier); - expect(identifierTokens.length).toBeGreaterThanOrEqual(3); // task, id, class - expect(identifierTokens[0].image).toBe('task'); - expect(identifierTokens[1].image).toBe('id'); + test('should tokenize whitespace', () => { + expect(tokenImages(' \t\n ')).toEqual([' \t\n ']); }); - }); - describe('Quote and Escape Characters', () => { - test('should tokenize double quotes as individual tokens', () => { - const input = '"Hello world"'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(DoubleQuote); - expect(tokenTypes).toContain(TextContent); - - // First and last tokens should be quotes - expect(result.tokens[0].tokenType).toBe(DoubleQuote); - expect(result.tokens[0].image).toBe('"'); - expect(result.tokens[result.tokens.length - 1].tokenType).toBe(DoubleQuote); - expect(result.tokens[result.tokens.length - 1].image).toBe('"'); + test('should tokenize identifiers', () => { + expect(tokenImages('simple-name_123')).toEqual(['simple-name_123']); }); - test('should tokenize single quotes as individual tokens', () => { - const input = "'Hello world'"; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(SingleQuote); - expect(tokenTypes).toContain(TextContent); - - // First and last tokens should be quotes - expect(result.tokens[0].tokenType).toBe(SingleQuote); - expect(result.tokens[0].image).toBe("'"); - expect(result.tokens[result.tokens.length - 1].tokenType).toBe(SingleQuote); - expect(result.tokens[result.tokens.length - 1].image).toBe("'"); + test('should tokenize text content', () => { + expect(tokenImages('plain text here')).toEqual(['plain text here']); }); + }); - test('should tokenize backslashes as individual tokens', () => { - const input = 'text\\with\\backslashes'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(Backslash); - expect(tokenTypes).toContain(TextContent); - - // Should have backslash tokens - const backslashTokens = result.tokens.filter(t => t.tokenType === Backslash); - expect(backslashTokens.length).toBe(2); - expect(backslashTokens[0].image).toBe('\\'); - expect(backslashTokens[1].image).toBe('\\'); + describe('Specific Cases from Requirements', () => { + test('should handle "abcdefghi"', () => { + expect(tokenImages('"abcdefghi"')).toEqual([ + '"', 'abc', '<', 'poml', '>', 'def', '', 'ghi', '"' + ]); }); - test('should handle mixed quotes and backslashes', () => { - const input = 'text "with \\"escaped\\" quotes"'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(DoubleQuote); - expect(tokenTypes).toContain(Backslash); - expect(tokenTypes).toContain(TextContent); + test('should handle ghi', () => { + expect(tokenImages('ghi')).toEqual([ + '<', 'poml', ' ', 'abc', '=', '"', 'def', '"', '>', 'ghi', '' + ]); }); - }); - describe('Identifiers', () => { - test('should tokenize simple identifiers', () => { - const input = 'task'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Identifier); - expect(result.tokens[0].image).toBe('task'); + test('should handle mixed content', () => { + expect(tokenImages('text {{var}} more')).toEqual([ + 'text ', '{{', 'var', '}}', ' more' + ]); }); - test('should tokenize identifiers with hyphens', () => { - const input = 'my-component'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Identifier); - expect(result.tokens[0].image).toBe('my-component'); + test('should handle complex attributes', () => { + expect(tokenImages('')).toEqual([ + '<', 'task', ' ', 'id', '=', '"', '{{', 'value', '}}', '"', ' ', 'class', '=', '"', 'test', '"', '>' + ]); }); - test('should tokenize identifiers with underscores', () => { - const input = 'my_component'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Identifier); - expect(result.tokens[0].image).toBe('my_component'); + test('should handle escaped quotes', () => { + expect(tokenImages('text "with \\"escaped\\" quotes"')).toEqual([ + 'text ', '"', 'with ', '\\', '"', 'escaped', '\\', '"', ' quotes', '"' + ]); }); + }); - test('should tokenize identifiers with numbers', () => { - const input = 'component123'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Identifier); - expect(result.tokens[0].image).toBe('component123'); + describe('Token Types', () => { + test('should identify correct token types for basic elements', () => { + expect(tokenTypes('')).toEqual([TagOpen, Identifier, TagClose]); + expect(tokenTypes('')).toEqual([TagClosingOpen, Identifier, TagClose]); + expect(tokenTypes('')).toEqual([TagOpen, Identifier, Whitespace, TagSelfClose]); }); - }); - describe('Text Content', () => { - test('should tokenize plain text', () => { - const input = 'This is some plain text'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(TextContent); - expect(result.tokens[0].image).toBe('This is some plain text'); + test('should identify quotes and backslashes', () => { + expect(tokenTypes('"text"')).toEqual([DoubleQuote, TextContent, DoubleQuote]); + expect(tokenTypes("'text'")).toEqual([SingleQuote, TextContent, SingleQuote]); + expect(tokenTypes('text\\escape')).toEqual([TextContent, Backslash, TextContent]); }); - test('should tokenize text with newlines', () => { - const input = `Line 1 -Line 2 -Line 3`; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(TextContent); - expect(result.tokens[0].image).toBe(`Line 1 -Line 2 -Line 3`); + test('should identify template variables', () => { + expect(tokenTypes('{{variable}}')).toEqual([TemplateOpen, TemplateContent, TemplateClose]); }); - test('should stop text content at tags', () => { - const input = 'Some text '; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(4); - expect(result.tokens[0].tokenType).toBe(TextContent); - expect(result.tokens[0].image).toBe('Some text '); - expect(result.tokens[1].tokenType).toBe(TagOpen); - expect(result.tokens[2].tokenType).toBe(Identifier); - expect(result.tokens[3].tokenType).toBe(TagClose); + test('should identify comments', () => { + expect(tokenTypes('')).toEqual([Comment]); }); - test('should stop text content at template variables', () => { - const input = 'Some text {{variable}}'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(4); - expect(result.tokens[0].tokenType).toBe(TextContent); - expect(result.tokens[0].image).toBe('Some text '); - expect(result.tokens[1].tokenType).toBe(TemplateOpen); - expect(result.tokens[2].tokenType).toBe(TemplateContent); - expect(result.tokens[3].tokenType).toBe(TemplateClose); + test('should identify whitespace', () => { + expect(tokenTypes(' \t\n ')).toEqual([Whitespace]); }); }); - describe('Complex Mixed Content', () => { - test('should tokenize extended POML example from specification', () => { - const input = `# My Analysis Document - -This is a regular markdown document. - - - Analyze the following data and provide insights. - - -Here are some key points: -- Data quality -- Statistical significance - -{{variable_will_be_substituted}}`; - - const result = extendedPomlLexer.tokenize(input); - + describe('Source Position and Error Tests', () => { + test('should provide correct source positions', () => { + const result = tokenize('content'); expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - // Check for presence of different token types - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(TextContent); - expect(tokenTypes).toContain(TagOpen); - expect(tokenTypes).toContain(TagClose); - expect(tokenTypes).toContain(Identifier); - expect(tokenTypes).toContain(TemplateOpen); - expect(tokenTypes).toContain(TemplateClose); - expect(tokenTypes).toContain(TemplateContent); - }); - - test('should tokenize comments with tags and templates', () => { - const input = '{{content}}'; - const result = extendedPomlLexer.tokenize(input); + const tokens = result.tokens; + expect(tokens[0].startOffset).toBe(0); + expect(tokens[0].endOffset).toBe(1); + expect(tokens[0].image).toBe('<'); - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); + expect(tokens[1].startOffset).toBe(1); + expect(tokens[1].endOffset).toBe(5); + expect(tokens[1].image).toBe('task'); - // First token should be comment - expect(result.tokens[0].tokenType).toBe(Comment); + expect(tokens[2].startOffset).toBe(5); + expect(tokens[2].endOffset).toBe(6); + expect(tokens[2].image).toBe('>'); }); - test('should handle self-closing tags with attributes', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); + test('should handle line and column tracking', () => { + const input = `line1 +line2 +line3`; + const result = tokenize(input); - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(TagOpen); - expect(tokenTypes).toContain(Identifier); - expect(tokenTypes).toContain(Equals); - expect(tokenTypes).toContain(DoubleQuote); - expect(tokenTypes).toContain(TemplateOpen); - expect(tokenTypes).toContain(TemplateClose); - expect(tokenTypes).toContain(TagSelfClose); - }); - - test('should handle the specific case: "abcdefghi"', () => { - const input = '"abcdefghi"'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - // Should tokenize as: " abc < poml > def ghi " - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(DoubleQuote); - expect(tokenTypes).toContain(TextContent); - expect(tokenTypes).toContain(TagOpen); - expect(tokenTypes).toContain(Identifier); - expect(tokenTypes).toContain(TagClose); - expect(tokenTypes).toContain(TagClosingOpen); - - // First and last tokens should be quotes - expect(result.tokens[0].tokenType).toBe(DoubleQuote); - expect(result.tokens[0].image).toBe('"'); - expect(result.tokens[result.tokens.length - 1].tokenType).toBe(DoubleQuote); - expect(result.tokens[result.tokens.length - 1].image).toBe('"'); + const tagToken = result.tokens.find(t => t.tokenType === TagOpen); + expect(tagToken).toBeDefined(); + expect(tagToken!.startLine).toBe(2); + expect(tagToken!.startColumn).toBe(7); // After "line2 " }); - test('should handle the attribute case: ghi', () => { - const input = 'ghi'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); + test('should handle malformed input gracefully', () => { + const result = tokenize(' ghi - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(TagOpen); - expect(tokenTypes).toContain(Identifier); - expect(tokenTypes).toContain(Equals); - expect(tokenTypes).toContain(DoubleQuote); - expect(tokenTypes).toContain(TextContent); - expect(tokenTypes).toContain(TagClose); - expect(tokenTypes).toContain(TagClosingOpen); - - // Verify the structure - expect(result.tokens[0].tokenType).toBe(TagOpen); - expect(result.tokens[0].image).toBe('<'); - - const identifierTokens = result.tokens.filter(t => t.tokenType === Identifier); - expect(identifierTokens.length).toBeGreaterThanOrEqual(3); // poml, abc, def, poml - expect(identifierTokens[0].image).toBe('poml'); - expect(identifierTokens[1].image).toBe('abc'); + // Verify token positions are valid + for (const token of result.tokens) { + expect(token.startOffset).toBeLessThanOrEqual(token.endOffset); + expect(token.startOffset).toBeGreaterThanOrEqual(0); + expect(token.endOffset).toBeLessThanOrEqual(18); + } }); - }); - describe('Whitespace Handling', () => { - test('should preserve whitespace tokens', () => { - const input = ' \t\n \t\n \t\n '; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - - // Whitespace should be preserved - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(Whitespace); - expect(tokenTypes).toContain(TagOpen); - expect(tokenTypes).toContain(TagClose); - expect(tokenTypes).toContain(TagClosingOpen); - expect(tokenTypes).toContain(Identifier); + test('should handle special characters with errors', () => { + const result = tokenize('text with @#$%^&*()[]{}|;:,.<>?/~`'); + // Some special characters might cause lexing errors + expect(result.tokens.length).toBeGreaterThan(0); - // Should start with whitespace - expect(result.tokens[0].tokenType).toBe(Whitespace); - expect(result.tokens[0].image).toBe(' \t\n '); + // All tokens should have valid positions + for (const token of result.tokens) { + expect(token.startOffset).toBeLessThan(token.endOffset); + expect(token.image).toBeTruthy(); + } }); - }); - describe('Error Handling and Source Index Verification', () => { - test('should handle malformed input gracefully with correct source positions', () => { - const input = 'content'); + const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset - b.startOffset); - // Verify source positions are correct - let expectedOffset = 0; - for (const token of result.tokens) { - expect(token.startOffset).toBeGreaterThanOrEqual(expectedOffset); - expect(token.endOffset).toBeGreaterThan(token.startOffset); - expect(token.startOffset).toBeLessThan(input.length); - expect(token.endOffset).toBeLessThanOrEqual(input.length); - expectedOffset = token.startOffset; + for (let i = 0; i < sortedTokens.length - 1; i++) { + const current = sortedTokens[i]; + const next = sortedTokens[i + 1]; + expect(current.endOffset).toBeLessThanOrEqual(next.startOffset); } }); test('should handle empty input', () => { - const input = ''; - const result = extendedPomlLexer.tokenize(input); - + const result = tokenize(''); expect(result.errors).toHaveLength(0); expect(result.tokens).toHaveLength(0); }); - test('should handle input with only whitespace and preserve it', () => { - const input = ' \t\n '; - const result = extendedPomlLexer.tokenize(input); - + test('should handle whitespace only input', () => { + const result = tokenize(' \t\n '); expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); // Whitespace is preserved + expect(result.tokens).toHaveLength(1); expect(result.tokens[0].tokenType).toBe(Whitespace); - expect(result.tokens[0].startOffset).toBe(0); - expect(result.tokens[0].endOffset).toBe(input.length); }); + }); - test('should handle unclosed comments gracefully', () => { - const input = ' - expect(result.tokens.length).toBeGreaterThan(0); + describe('Complex Mixed Content', () => { + test('should handle extended POML specification example', () => { + const input = `# My Analysis + + + Analyze data + + +{{variable}}`; - // Verify source positions - for (const token of result.tokens) { - expect(token.startOffset).toBeLessThan(input.length); - expect(token.endOffset).toBeLessThanOrEqual(input.length); - } + const images = tokenImages(input); + expect(images).toContain('# My Analysis\n\n'); + expect(images).toContain('<'); + expect(images).toContain('task'); + expect(images).toContain('>'); + expect(images).toContain('{{'); + expect(images).toContain('variable'); + expect(images).toContain('}}'); }); - test('should handle mixed valid and invalid tokens with correct positions', () => { - const input = '{{template}}invalid@#$%^content'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - // Verify all tokens have valid source positions - for (let i = 0; i < result.tokens.length; i++) { - const token = result.tokens[i]; - if (token.startOffset !== undefined && token.endOffset !== undefined) { - expect(token.startOffset).toBeLessThan(input.length); - expect(token.endOffset).toBeLessThanOrEqual(input.length); - expect(token.startOffset).toBeLessThan(token.endOffset); - - // Verify token content matches input at specified positions - const tokenContent = input.substring(token.startOffset, token.endOffset); - expect(token.image).toBe(tokenContent); - } - } + test('should handle comments with mixed content', () => { + expect(tokenImages('content')).toEqual([ + '', '<', 'task', '>', 'content', '' + ]); }); - test('should handle incomplete template variables', () => { - const input = 'text {{incomplete_template'; - const result = extendedPomlLexer.tokenize(input); - - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - // Should tokenize text and template open, but template content extends to end - const tokenTypes = result.tokens.map(t => t.tokenType); - expect(tokenTypes).toContain(TextContent); - expect(tokenTypes).toContain(TemplateOpen); - expect(tokenTypes).toContain(TemplateContent); + test('should handle nested quotes and templates', () => { + expect(tokenImages('')).toEqual([ + '<', 'meta', ' ', 'value', '=', '"', '{{', 'path', '}}', '/file.txt', '"', '>' + ]); }); + }); - test('should handle nested incomplete structures', () => { - const input = '{{variablecontent'; - const result = extendedPomlLexer.tokenize(input); - + describe('Error Recovery', () => { + test('should handle incomplete template variables', () => { + const result = tokenize('text {{incomplete'); expect(result.errors).toHaveLength(0); expect(result.tokens.length).toBeGreaterThan(0); - // Verify continuous coverage of input - let coveredLength = 0; - for (const token of result.tokens) { - if (token.tokenType !== Whitespace) { - coveredLength += token.image.length; - } - } - expect(coveredLength).toBeLessThanOrEqual(input.length); + const types = result.tokens.map(t => t.tokenType); + expect(types).toContain(TextContent); + expect(types).toContain(TemplateOpen); + expect(types).toContain(TemplateContent); }); - test('should handle line and column tracking correctly', () => { - const input = `line 1 -line 2 -line 3 {{var}}`; - const result = extendedPomlLexer.tokenize(input); - + test('should handle unclosed comments', () => { + const result = tokenize('')).toEqual(['']); @@ -71,38 +69,108 @@ describe('ExtendedPomlLexer', () => { }); test('should tokenize text content', () => { - expect(tokenImages('plain text here')).toEqual(['plain text here']); + expect(tokenImages('plain text here')).toEqual(['plain', ' ', 'text', ' ', 'here']); }); }); describe('Specific Cases from Requirements', () => { test('should handle "abcdefghi"', () => { expect(tokenImages('"abcdefghi"')).toEqual([ - '"', 'abc', '<', 'poml', '>', 'def', '', 'ghi', '"' + '"', + 'abc', + '<', + 'poml', + '>', + 'def', + '', + 'ghi', + '"' ]); }); test('should handle ghi', () => { expect(tokenImages('ghi')).toEqual([ - '<', 'poml', ' ', 'abc', '=', '"', 'def', '"', '>', 'ghi', '' + '<', + 'poml', + ' ', + 'abc', + '=', + '"', + 'def', + '"', + '>', + 'ghi', + '' ]); }); test('should handle mixed content', () => { expect(tokenImages('text {{var}} more')).toEqual([ - 'text ', '{{', 'var', '}}', ' more' + 'text', + ' ', + '{{', + 'var', + '}}', + ' ', + 'more' + ]); + }); + + test('chinese characters', () => { + expect(tokenImages('中文 {{ 文本 }}内容< 标签>')).toEqual([ + '中文 ', + '{{', + ' ', + '文本 ', + '}}', + '内容', + '<', + ' ', + '标签>' ]); }); test('should handle complex attributes', () => { expect(tokenImages('')).toEqual([ - '<', 'task', ' ', 'id', '=', '"', '{{', 'value', '}}', '"', ' ', 'class', '=', '"', 'test', '"', '>' + '<', + 'task', + ' ', + 'id', + '=', + '"', + '{{', + 'value', + '}}', + '"', + ' ', + 'class', + '=', + '"', + 'test', + '"', + '>' ]); }); test('should handle escaped quotes', () => { expect(tokenImages('text "with \\"escaped\\" quotes"')).toEqual([ - 'text ', '"', 'with ', '\\', '"', 'escaped', '\\', '"', ' quotes', '"' + 'text', + ' ', + '"', + 'with', + ' ', + '\\', + '"', + 'escaped', + '\\', + '"', + ' ', + 'quotes', + '"' ]); }); }); @@ -115,13 +183,13 @@ describe('ExtendedPomlLexer', () => { }); test('should identify quotes and backslashes', () => { - expect(tokenTypes('"text"')).toEqual([DoubleQuote, TextContent, DoubleQuote]); - expect(tokenTypes("'text'")).toEqual([SingleQuote, TextContent, SingleQuote]); - expect(tokenTypes('text\\escape')).toEqual([TextContent, Backslash, TextContent]); + expect(tokenTypes('"text"')).toEqual([DoubleQuote, Identifier, DoubleQuote]); + expect(tokenTypes("'text'")).toEqual([SingleQuote, Identifier, SingleQuote]); + expect(tokenTypes('text\\escape')).toEqual([Identifier, Backslash, Identifier]); }); test('should identify template variables', () => { - expect(tokenTypes('{{variable}}')).toEqual([TemplateOpen, TemplateContent, TemplateClose]); + expect(tokenTypes('{{variable}}')).toEqual([TemplateOpen, Identifier, TemplateClose]); }); test('should identify comments', () => { @@ -137,18 +205,18 @@ describe('ExtendedPomlLexer', () => { test('should provide correct source positions', () => { const result = tokenize('content'); expect(result.errors).toHaveLength(0); - + const tokens = result.tokens; expect(tokens[0].startOffset).toBe(0); - expect(tokens[0].endOffset).toBe(1); + expect(tokens[0].endOffset).toBe(0); expect(tokens[0].image).toBe('<'); - + expect(tokens[1].startOffset).toBe(1); - expect(tokens[1].endOffset).toBe(5); + expect(tokens[1].endOffset).toBe(4); expect(tokens[1].image).toBe('task'); - + expect(tokens[2].startOffset).toBe(5); - expect(tokens[2].endOffset).toBe(6); + expect(tokens[2].endOffset).toBe(5); expect(tokens[2].image).toBe('>'); }); @@ -157,7 +225,7 @@ describe('ExtendedPomlLexer', () => { line2 line3`; const result = tokenize(input); - + const tagToken = result.tokens.find(t => t.tokenType === TagOpen); expect(tagToken).toBeDefined(); expect(tagToken!.startLine).toBe(2); @@ -168,31 +236,19 @@ line3`; const result = tokenize('content'); const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset - b.startOffset); - + for (let i = 0; i < sortedTokens.length - 1; i++) { const current = sortedTokens[i]; const next = sortedTokens[i + 1]; @@ -223,7 +279,7 @@ line3`; {{variable}}`; - + const images = tokenImages(input); expect(images).toContain('# My Analysis\n\n'); expect(images).toContain('<'); @@ -236,13 +292,31 @@ line3`; test('should handle comments with mixed content', () => { expect(tokenImages('content')).toEqual([ - '', '<', 'task', '>', 'content', '' + '', + '<', + 'task', + '>', + 'content', + '' ]); }); test('should handle nested quotes and templates', () => { expect(tokenImages('')).toEqual([ - '<', 'meta', ' ', 'value', '=', '"', '{{', 'path', '}}', '/file.txt', '"', '>' + '<', + 'meta', + ' ', + 'value', + '=', + '"', + '{{', + 'path', + '}}', + '/file.txt', + '"', + '>' ]); }); }); @@ -252,11 +326,10 @@ line3`; const result = tokenize('text {{incomplete'); expect(result.errors).toHaveLength(0); expect(result.tokens.length).toBeGreaterThan(0); - + const types = result.tokens.map(t => t.tokenType); - expect(types).toContain(TextContent); + expect(types).toContain(Identifier); expect(types).toContain(TemplateOpen); - expect(types).toContain(TemplateContent); }); test('should handle unclosed comments', () => { @@ -268,7 +341,7 @@ line3`; test('should handle mixed valid and invalid content', () => { const result = tokenize('content@#$invalid'); expect(result.tokens.length).toBeGreaterThan(0); - + // Should tokenize the valid parts const images = result.tokens.map(t => t.image); expect(images).toContain('<'); @@ -279,8 +352,10 @@ line3`; test('should handle special characters in text content', () => { const input = 'text with @#$%^&*()[]{}|;:,.<>?/~`'; - const images = tokenImages(input); - expect(images).toEqual(['text with @#$%^&*()[]{}|;:,.<>?/~`']); + const result = tokenize(input); + expect(result.errors).toHaveLength(0); + const images = result.tokens.map(t => t.image); + expect(images).toEqual(['text', ' ', 'with', ' ', '@#$%^&*()[]{}|;:,.', '<', '>', '?/~`']); }); }); }); From 3c3b226b60ec54a69556282d42ed68cb4a9d25ff Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 16 Jul 2025 23:50:03 +0800 Subject: [PATCH 14/76] . --- packages/poml/tests/reader/lexer.test.ts | 628 +++++++++++------------ 1 file changed, 313 insertions(+), 315 deletions(-) diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index 4e72e763..15da7907 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -34,328 +34,326 @@ function tokenize(input: string) { return extendedPomlLexer.tokenize(input); } -describe('ExtendedPomlLexer', () => { - describe('Basic Token Images', () => { - test('should tokenize HTML comments', () => { - expect(tokenImages('')).toEqual(['']); - }); - - test('should tokenize template variables', () => { - expect(tokenImages('{{variable}}')).toEqual(['{{', 'variable', '}}']); - }); - - test('should tokenize XML tags', () => { - expect(tokenImages('')).toEqual(['<', 'task', '>']); - expect(tokenImages('')).toEqual(['']); - expect(tokenImages('')).toEqual(['<', 'meta', ' ', '/>']); - }); - - test('should tokenize quotes and backslashes individually', () => { - expect(tokenImages('"hello"')).toEqual(['"', 'hello', '"']); - expect(tokenImages("'world'")).toEqual(["'", 'world', "'"]); - expect(tokenImages('text\\escape')).toEqual(['text', '\\', 'escape']); - }); - - test('should tokenize attributes', () => { - expect(tokenImages('id="value"')).toEqual(['id', '=', '"', 'value', '"']); - }); - - test('should tokenize whitespace', () => { - expect(tokenImages(' \t\n ')).toEqual([' \t\n ']); - }); - - test('should tokenize identifiers', () => { - expect(tokenImages('simple-name_123')).toEqual(['simple-name_123']); - }); - - test('should tokenize text content', () => { - expect(tokenImages('plain text here')).toEqual(['plain', ' ', 'text', ' ', 'here']); - }); - }); - - describe('Specific Cases from Requirements', () => { - test('should handle "abcdefghi"', () => { - expect(tokenImages('"abcdefghi"')).toEqual([ - '"', - 'abc', - '<', - 'poml', - '>', - 'def', - '', - 'ghi', - '"' - ]); - }); - - test('should handle ghi', () => { - expect(tokenImages('ghi')).toEqual([ - '<', - 'poml', - ' ', - 'abc', - '=', - '"', - 'def', - '"', - '>', - 'ghi', - '' - ]); - }); - - test('should handle mixed content', () => { - expect(tokenImages('text {{var}} more')).toEqual([ - 'text', - ' ', - '{{', - 'var', - '}}', - ' ', - 'more' - ]); - }); - - test('chinese characters', () => { - expect(tokenImages('中文 {{ 文本 }}内容< 标签>')).toEqual([ - '中文 ', - '{{', - ' ', - '文本 ', - '}}', - '内容', - '<', - ' ', - '标签>' - ]); - }); - - test('should handle complex attributes', () => { - expect(tokenImages('')).toEqual([ - '<', - 'task', - ' ', - 'id', - '=', - '"', - '{{', - 'value', - '}}', - '"', - ' ', - 'class', - '=', - '"', - 'test', - '"', - '>' - ]); - }); - - test('should handle escaped quotes', () => { - expect(tokenImages('text "with \\"escaped\\" quotes"')).toEqual([ - 'text', - ' ', - '"', - 'with', - ' ', - '\\', - '"', - 'escaped', - '\\', - '"', - ' ', - 'quotes', - '"' - ]); - }); - }); - - describe('Token Types', () => { - test('should identify correct token types for basic elements', () => { - expect(tokenTypes('')).toEqual([TagOpen, Identifier, TagClose]); - expect(tokenTypes('')).toEqual([TagClosingOpen, Identifier, TagClose]); - expect(tokenTypes('')).toEqual([TagOpen, Identifier, Whitespace, TagSelfClose]); - }); - - test('should identify quotes and backslashes', () => { - expect(tokenTypes('"text"')).toEqual([DoubleQuote, Identifier, DoubleQuote]); - expect(tokenTypes("'text'")).toEqual([SingleQuote, Identifier, SingleQuote]); - expect(tokenTypes('text\\escape')).toEqual([Identifier, Backslash, Identifier]); - }); - - test('should identify template variables', () => { - expect(tokenTypes('{{variable}}')).toEqual([TemplateOpen, Identifier, TemplateClose]); - }); - - test('should identify comments', () => { - expect(tokenTypes('')).toEqual([Comment]); - }); - - test('should identify whitespace', () => { - expect(tokenTypes(' \t\n ')).toEqual([Whitespace]); - }); - }); - - describe('Source Position and Error Tests', () => { - test('should provide correct source positions', () => { - const result = tokenize('content'); - expect(result.errors).toHaveLength(0); - - const tokens = result.tokens; - expect(tokens[0].startOffset).toBe(0); - expect(tokens[0].endOffset).toBe(0); - expect(tokens[0].image).toBe('<'); - - expect(tokens[1].startOffset).toBe(1); - expect(tokens[1].endOffset).toBe(4); - expect(tokens[1].image).toBe('task'); - - expect(tokens[2].startOffset).toBe(5); - expect(tokens[2].endOffset).toBe(5); - expect(tokens[2].image).toBe('>'); - }); - - test('should handle line and column tracking', () => { - const input = `line1 +describe('Basic Token Images', () => { + test('should tokenize HTML comments', () => { + expect(tokenImages('')).toEqual(['']); + }); + + test('should tokenize template variables', () => { + expect(tokenImages('{{variable}}')).toEqual(['{{', 'variable', '}}']); + }); + + test('should tokenize XML tags', () => { + expect(tokenImages('')).toEqual(['<', 'task', '>']); + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['<', 'meta', ' ', '/>']); + }); + + test('should tokenize quotes and backslashes individually', () => { + expect(tokenImages('"hello"')).toEqual(['"', 'hello', '"']); + expect(tokenImages("'world'")).toEqual(["'", 'world', "'"]); + expect(tokenImages('text\\escape')).toEqual(['text', '\\', 'escape']); + }); + + test('should tokenize attributes', () => { + expect(tokenImages('id="value"')).toEqual(['id', '=', '"', 'value', '"']); + }); + + test('should tokenize whitespace', () => { + expect(tokenImages(' \t\n ')).toEqual([' \t\n ']); + }); + + test('should tokenize identifiers', () => { + expect(tokenImages('simple-name_123')).toEqual(['simple-name_123']); + }); + + test('should tokenize text content', () => { + expect(tokenImages('plain text here')).toEqual(['plain', ' ', 'text', ' ', 'here']); + }); +}); + +describe('Edge Cases', () => { + test('should handle "abcdefghi"', () => { + expect(tokenImages('"abcdefghi"')).toEqual([ + '"', + 'abc', + '<', + 'poml', + '>', + 'def', + '', + 'ghi', + '"' + ]); + }); + + test('should handle ghi', () => { + expect(tokenImages('ghi')).toEqual([ + '<', + 'poml', + ' ', + 'abc', + '=', + '"', + 'def', + '"', + '>', + 'ghi', + '' + ]); + }); + + test('should handle mixed content', () => { + expect(tokenImages('text {{var}} more')).toEqual([ + 'text', + ' ', + '{{', + 'var', + '}}', + ' ', + 'more' + ]); + }); + + test('chinese characters', () => { + expect(tokenImages('中文 {{ 文本 }}内容< 标签>')).toEqual([ + '中文 ', + '{{', + ' ', + '文本 ', + '}}', + '内容', + '<', + ' ', + '标签>' + ]); + }); + + test('should handle complex attributes', () => { + expect(tokenImages('')).toEqual([ + '<', + 'task', + ' ', + 'id', + '=', + '"', + '{{', + 'value', + '}}', + '"', + ' ', + 'class', + '=', + '"', + 'test', + '"', + '>' + ]); + }); + + test('should handle escaped quotes', () => { + expect(tokenImages('text "with \\"escaped\\" quotes"')).toEqual([ + 'text', + ' ', + '"', + 'with', + ' ', + '\\', + '"', + 'escaped', + '\\', + '"', + ' ', + 'quotes', + '"' + ]); + }); +}); + +describe('Token Types', () => { + test('should identify correct token types for basic elements', () => { + expect(tokenTypes('')).toEqual([TagOpen, Identifier, TagClose]); + expect(tokenTypes('')).toEqual([TagClosingOpen, Identifier, TagClose]); + expect(tokenTypes('')).toEqual([TagOpen, Identifier, Whitespace, TagSelfClose]); + }); + + test('should identify quotes and backslashes', () => { + expect(tokenTypes('"text"')).toEqual([DoubleQuote, Identifier, DoubleQuote]); + expect(tokenTypes("'text'")).toEqual([SingleQuote, Identifier, SingleQuote]); + expect(tokenTypes('text\\escape')).toEqual([Identifier, Backslash, Identifier]); + }); + + test('should identify template variables', () => { + expect(tokenTypes('{{variable}}')).toEqual([TemplateOpen, Identifier, TemplateClose]); + }); + + test('should identify comments', () => { + expect(tokenTypes('')).toEqual([Comment]); + }); + + test('should identify whitespace', () => { + expect(tokenTypes(' \t\n ')).toEqual([Whitespace]); + }); +}); + +describe('Source Position and Error Tests', () => { + test('should provide correct source positions', () => { + const result = tokenize('content'); + expect(result.errors).toHaveLength(0); + + const tokens = result.tokens; + expect(tokens[0].startOffset).toBe(0); + expect(tokens[0].endOffset).toBe(0); + expect(tokens[0].image).toBe('<'); + + expect(tokens[1].startOffset).toBe(1); + expect(tokens[1].endOffset).toBe(4); + expect(tokens[1].image).toBe('task'); + + expect(tokens[2].startOffset).toBe(5); + expect(tokens[2].endOffset).toBe(5); + expect(tokens[2].image).toBe('>'); + }); + + test('should handle line and column tracking', () => { + const input = `line1 line2 line3`; - const result = tokenize(input); - - const tagToken = result.tokens.find(t => t.tokenType === TagOpen); - expect(tagToken).toBeDefined(); - expect(tagToken!.startLine).toBe(2); - expect(tagToken!.startColumn).toBe(7); // After "line2 " - }); - - test('should handle malformed input gracefully', () => { - const result = tokenize('content'); - const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset - b.startOffset); - - for (let i = 0; i < sortedTokens.length - 1; i++) { - const current = sortedTokens[i]; - const next = sortedTokens[i + 1]; - expect(current.endOffset).toBeLessThanOrEqual(next.startOffset); - } - }); - - test('should handle empty input', () => { - const result = tokenize(''); - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(0); - }); - - test('should handle whitespace only input', () => { - const result = tokenize(' \t\n '); - expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); - expect(result.tokens[0].tokenType).toBe(Whitespace); - }); - }); - - describe('Complex Mixed Content', () => { - test('should handle extended POML specification example', () => { - const input = `# My Analysis + const result = tokenize(input); + + const tagToken = result.tokens.find(t => t.tokenType === TagOpen); + expect(tagToken).toBeDefined(); + expect(tagToken!.startLine).toBe(2); + expect(tagToken!.startColumn).toBe(7); // After "line2 " + }); + + test('should handle malformed input gracefully', () => { + const result = tokenize('content'); + const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset - b.startOffset); + + for (let i = 0; i < sortedTokens.length - 1; i++) { + const current = sortedTokens[i]; + const next = sortedTokens[i + 1]; + expect(current.endOffset).toBeLessThanOrEqual(next.startOffset); + } + }); + + test('should handle empty input', () => { + const result = tokenize(''); + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(0); + }); + + test('should handle whitespace only input', () => { + const result = tokenize(' \t\n '); + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].tokenType).toBe(Whitespace); + }); +}); + +describe('Complex Mixed Content', () => { + test('should handle extended POML specification example', () => { + const input = `# My Analysis - Analyze data +Analyze data {{variable}}`; - const images = tokenImages(input); - expect(images).toContain('# My Analysis\n\n'); - expect(images).toContain('<'); - expect(images).toContain('task'); - expect(images).toContain('>'); - expect(images).toContain('{{'); - expect(images).toContain('variable'); - expect(images).toContain('}}'); - }); - - test('should handle comments with mixed content', () => { - expect(tokenImages('content')).toEqual([ - '', - '<', - 'task', - '>', - 'content', - '' - ]); - }); - - test('should handle nested quotes and templates', () => { - expect(tokenImages('')).toEqual([ - '<', - 'meta', - ' ', - 'value', - '=', - '"', - '{{', - 'path', - '}}', - '/file.txt', - '"', - '>' - ]); - }); - }); - - describe('Error Recovery', () => { - test('should handle incomplete template variables', () => { - const result = tokenize('text {{incomplete'); - expect(result.errors).toHaveLength(0); - expect(result.tokens.length).toBeGreaterThan(0); - - const types = result.tokens.map(t => t.tokenType); - expect(types).toContain(Identifier); - expect(types).toContain(TemplateOpen); - }); - - test('should handle unclosed comments', () => { - const result = tokenize('content')).toEqual([ + '', + '<', + 'task', + '>', + 'content', + '' + ]); + }); + + test('should handle nested quotes and templates', () => { + expect(tokenImages('')).toEqual([ + '<', + 'meta', + ' ', + 'value', + '=', + '"', + '{{', + 'path', + '}}', + '/file.txt', + '"', + '>' + ]); + }); +}); + +describe('Error Recovery', () => { + test('should handle incomplete template variables', () => { + const result = tokenize('text {{incomplete'); + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + const types = result.tokens.map(t => t.tokenType); + expect(types).toContain(Identifier); + expect(types).toContain(TemplateOpen); + }); + + test('should handle unclosed comments', () => { + const result = tokenize(' + + + {{page.title}} + + + +
+ {{content}} +
+ +`, + + ` + {{description}} + +`, + + `"Complex string with {{variables}} and inside"`, + + `{{#each items}} +
  • + {{name}} +
  • +{{/each}}` + ]; + + realWorldTests.forEach(test => { + const result = tokenize(test); + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + + // Verify position integrity + result.tokens.forEach(token => { + expect(token.startOffset).toBeGreaterThanOrEqual(0); + expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); + }); + }); + }); + + test('should handle equals sign in various contexts', () => { + const equalsTests = [ + 'attr=value', + 'attr="value"', + "attr='value'", + 'attr={{value}}', + 'first=one second=two', + '=standalone', + 'text=content', + 'a=b=c' + ]; + + equalsTests.forEach(test => { + const result = tokenize(test); + expect(result.errors).toHaveLength(0); + + const equalsTokens = result.tokens.filter(t => t.tokenType.name === 'Equals'); + expect(equalsTokens.length).toBeGreaterThan(0); + }); + }); + + test('should handle edge cases with zero-length matches', () => { + const edgeCases = ['', ' ', '\n', '\t', '\r', '{{}}', '', '<>', '""', "''", '\\']; + + edgeCases.forEach(test => { + const result = tokenize(test); + expect(result.errors).toHaveLength(0); + + if (test === '') { + expect(result.tokens).toHaveLength(0); + } else { + expect(result.tokens.length).toBeGreaterThan(0); + } + }); + }); }); describe('Token Types', () => { @@ -320,6 +390,524 @@ Analyze data }); }); +describe('Boundary Conditions', () => { + test('should handle single character inputs', () => { + expect(tokenize('<').tokens).toHaveLength(1); + expect(tokenize('>').tokens).toHaveLength(1); + expect(tokenize('"').tokens).toHaveLength(1); + expect(tokenize("'").tokens).toHaveLength(1); + expect(tokenize('\\').tokens).toHaveLength(1); + expect(tokenize('=').tokens).toHaveLength(1); + expect(tokenize(' ').tokens).toHaveLength(1); + expect(tokenize('\t').tokens).toHaveLength(1); + expect(tokenize('\n').tokens).toHaveLength(1); + expect(tokenize('a').tokens).toHaveLength(1); + expect(tokenize('_').tokens).toHaveLength(1); + expect(tokenize('1').tokens).toHaveLength(1); + expect(tokenize('@').tokens).toHaveLength(1); + }); + + test('should handle two character edge cases', () => { + expect(tokenImages('{{')).toEqual(['{{']); + expect(tokenImages('}}')).toEqual(['}}']); + expect(tokenImages('')).toEqual(['/>']); + expect(tokenImages('{}')).toEqual(['{}']); + expect(tokenImages('}{')).toEqual(['}{']); + expect(tokenImages('""')).toEqual(['"', '"']); + expect(tokenImages("''")).toEqual(["'", "'"]); + expect(tokenImages('<>')).toEqual(['<', '>']); + }); + + test('should handle minimum valid patterns', () => { + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['<', 'a', '>']); + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['<', 'a', '/>']); + }); + + test('should handle very long inputs without crashes', () => { + const longText = 'a'.repeat(100000); + const result = tokenize(longText); + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].image).toBe(longText); + + const longComment = ``; + const commentResult = tokenize(longComment); + expect(commentResult.errors).toHaveLength(0); + expect(commentResult.tokens).toHaveLength(1); + + const longIdentifier = 'a' + 'b'.repeat(10000); + const identifierResult = tokenize(longIdentifier); + expect(identifierResult.errors).toHaveLength(0); + expect(identifierResult.tokens).toHaveLength(1); + }); + + test('should handle maximum practical complexity', () => { + const complexInput = + '<' + + 'tag'.repeat(1000) + + ' attr="' + + 'value'.repeat(1000) + + '">' + + 'content'.repeat(1000) + + ''; + const result = tokenize(complexInput); + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(13); + }); + + test('should handle deeply nested structures', () => { + let nested = ''; + for (let i = 0; i < 100; i++) { + nested += ``; + } + nested += 'content'; + for (let i = 99; i >= 0; i--) { + nested += ``; + } + const result = tokenize(nested); + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(601); + }); +}); + +describe('Unicode and Special Characters', () => { + test('should handle CJK characters', () => { + expect(tokenImages('你好世界')).toEqual(['你好世界']); + expect(tokenImages('こんにちは')).toEqual(['こんにちは']); + expect(tokenImages('안녕하세요')).toEqual(['안녕하세요']); + }); + + test('should handle emoji and symbols', () => { + expect(tokenImages('Hello 👋 World 🌍')).toEqual(['Hello', ' ', '👋 World 🌍']); + expect(tokenImages('Math: ∑∞π≠∅')).toEqual(['Math', ': ∑∞π≠∅']); + expect(tokenImages('Arrows: ←→↑↓')).toEqual(['Arrows', ': ←→↑↓']); + }); + + test('should handle unicode', () => { + expect(tokenImages('<こんにちは>')).toEqual(['<', 'こんにちは>']); + expect(tokenImages('{{你好}}')).toEqual(['{{', '你好', '}}']); + expect(tokenImages('')).toEqual([ + '<', + 'tag', + ' ', + 'attr', + '=', + '"', + 'caf', + 'é', + '"', + '>' + ]); + }); + + test('should maintain lexer stability with all edge cases', () => { + // Combination of many edge cases + const stressTest = + '\uFEFF\x00\x01\x02\x03\x07content\x08\x09\x0A'; + + const result = tokenize(stressTest); + expect(result.tokens.length).toBeGreaterThan(0); + + // Verify token integrity + result.tokens.forEach(token => { + expect(token.startOffset).toBeGreaterThanOrEqual(0); + if (token.endOffset !== undefined) { + expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset); + } + }); + }); +}); + +describe('Malformed Patterns', () => { + test('should handle incomplete comments', () => { + expect(tokenize('')).toEqual(['']); + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('more{{ content')).toEqual([ + '', + '<', + 'tag', + '>', + 'more', + '{{', + ' ', + 'content' + ]); + expect(tokenImages("\"quoted textend")).toEqual([ + '"', + 'quoted', + ' ', + 'text', + '<', + 'tag', + ' ', + 'attr', + '=', + "'", + 'mixed', + "'", + '>', + 'end' + ]); + }); + + test('should handle lookahead boundary cases for single braces', () => { + expect(tokenImages('{nottemplate}')).toEqual(['{nottemplate}']); + expect(tokenImages('}notclosing{')).toEqual(['}notclosing{']); + expect(tokenImages('text{more}text')).toEqual(['text', '{more}text']); + expect(tokenImages('before}after')).toEqual(['before', '}after']); + expect(tokenImages('before{after')).toEqual(['before', '{after']); + expect(tokenImages('text } { more')).toEqual(['text', ' ', '} { more']); + }); + + test('should handle greedy vs non-greedy matching', () => { + expect(tokenImages('')).toEqual(['', '']); + expect(tokenImages('{{first}}{{second}}')).toEqual(['{{', 'first', '}}', '{{', 'second', '}}']); + expect(tokenImages('textmore')).toEqual(['text', '', 'more']); + }); +}); + +describe('Position Tracking Accuracy', () => { + test('should track positions accurately across multiple lines', () => { + const input = `line1 +content +{{variable}} +final line`; + const result = tokenize(input); + + const tagOpenToken = result.tokens.find(t => t.image === '<' && t.startLine === 2); + expect(tagOpenToken).toBeDefined(); + expect(tagOpenToken!.startColumn).toBe(1); + + const variableToken = result.tokens.find(t => t.image === 'variable'); + expect(variableToken).toBeDefined(); + expect(variableToken!.startLine).toBe(3); + }); + + test('should track positions accurately with mixed line endings', () => { + const input = 'line1\r\nline2\nline3\r'; + const result = tokenize(input); + + expect(result.tokens.length).toBeGreaterThan(0); + result.tokens.forEach(token => { + expect(token.startOffset).toBeGreaterThanOrEqual(0); + expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); + expect(token.startLine).toBeGreaterThanOrEqual(1); + expect(token.startColumn).toBeGreaterThanOrEqual(1); + }); + }); + + test('should handle position tracking with empty tokens', () => { + const input = '<>""\'\'{{}}< >'; + const result = tokenize(input); + + // Verify all tokens have valid positions + result.tokens.forEach(token => { + expect(token.startOffset).toBeGreaterThanOrEqual(0); + expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); + expect(token.startLine).toBeGreaterThanOrEqual(1); + expect(token.startColumn).toBeGreaterThanOrEqual(1); + }); + }); + + test('should track positions accurately with tabs and mixed whitespace', () => { + const input = '\t\n\t\t\t\tcontent\t\n'; + const result = tokenize(input); + + // Find tokens and verify their positions make sense + const tagOpen = result.tokens.find(t => t.image === '<' && t.startLine === 1); + const innerOpen = result.tokens.find(t => t.image === '<' && t.startLine === 2); + + expect(tagOpen).toBeDefined(); + expect(innerOpen).toBeDefined(); + expect(tagOpen!.startColumn).toBe(2); // After tab + expect(innerOpen!.startColumn).toBe(3); // After two tabs + }); + + test('should verify complete coverage with no gaps', () => { + const input = 'content{{var}}'; + const result = tokenize(input); + + // Sort tokens by start position + const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset! - b.startOffset!); + + // Verify complete coverage + let expectedOffset = 0; + sortedTokens.forEach(token => { + expect(token.startOffset).toBeGreaterThanOrEqual(expectedOffset); + expectedOffset = token.endOffset! + 1; + }); + + // Should cover the entire input + expect(expectedOffset).toBeGreaterThanOrEqual(input.length); + }); + + test('should handle position tracking with comments spanning multiple lines', () => { + const input = `text + +more text`; + + const result = tokenize(input); + const commentToken = result.tokens.find(t => t.tokenType.name === 'Comment'); + + expect(commentToken).toBeDefined(); + expect(commentToken!.startLine).toBe(2); + expect(commentToken!.endLine).toBe(4); + }); + + test('should handle position tracking with carriage returns', () => { + const input = 'line1\r\rcontent\r'; + const result = tokenize(input); + + // Check that line numbers increase correctly + const lines = new Set(result.tokens.map(t => t.startLine)); + expect(lines.size).toBeGreaterThan(1); + + // Verify positions are sequential + result.tokens.forEach(token => { + expect(token.startOffset).toBeGreaterThanOrEqual(0); + expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); + }); + }); +}); + +describe('Performance and Stress Tests', () => { + test('should handle extremely long text content without performance degradation', () => { + const longText = 'a'.repeat(1000000); // 1MB of text + const start = performance.now(); + const result = tokenize(longText); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(result.tokens[0].image).toBe(longText); + expect(end - start).toBeLessThan(1000); // Should complete in under 1 second + }); + + test('should handle very long comments efficiently', () => { + const longComment = ``; + const start = performance.now(); + const result = tokenize(longComment); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(result.tokens).toHaveLength(1); + expect(end - start).toBeLessThan(500); // Should be fast + }); + + test('should handle many small tokens efficiently', () => { + const manyTokens = Array(10000).fill('content').join(' '); + const start = performance.now(); + const result = tokenize(manyTokens); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(10000); + expect(end - start).toBeLessThan(2000); // Should handle many tokens + }); + + test('should handle deeply nested template variables', () => { + let nested = ''; + for (let i = 0; i < 1000; i++) { + nested += `{{var${i}}}`; + } + + const start = performance.now(); + const result = tokenize(nested); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBe(3000); // 1000 * (open + content + close) + expect(end - start).toBeLessThan(1000); + }); + + test('should handle memory efficiently with large repetitive content', () => { + const pattern = '{{content}}'; + const repeated = Array(1000).fill(pattern).join('\n'); + + const start = performance.now(); + const result = tokenize(repeated); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(5000); + expect(end - start).toBeLessThan(1500); + }); + + test('should handle worst-case regex backtracking scenarios', () => { + // Patterns that could cause regex catastrophic backtracking + const backtrackingTests = [ + 'a'.repeat(10000) + 'b', + '{'.repeat(5000) + '}', + '<'.repeat(1000) + '>', + '"'.repeat(2000), + '', + Array(1000).fill('{{}}').join('') + ]; + + backtrackingTests.forEach(test => { + const start = performance.now(); + const result = tokenize(test); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(end - start).toBeLessThan(1000); // Should not hang + }); + }); + + test('should maintain linear performance with input size', () => { + const sizes = [1000, 5000, 10000, 20000]; + const times: number[] = []; + + sizes.forEach(size => { + const content = 'x'.repeat(size); + const start = performance.now(); + tokenize(content); + const end = performance.now(); + times.push(end - start); + }); + + // Performance should scale roughly linearly + expect(times[1]).toBeLessThan(times[0] * 10); + expect(times[2]).toBeLessThan(times[1] * 5); + expect(times[3]).toBeLessThan(times[2] * 3); + }); + + test('should handle maximum practical input sizes', () => { + // Test with 10MB of content + const hugeContent = Array(10000).fill('content').join(' '); + expect(hugeContent.length).toBe(10000 * 19 - 1); + + const start = performance.now(); + const result = tokenize(hugeContent); + const end = performance.now(); + + expect(result.errors).toHaveLength(0); + expect(result.tokens.length).toBeGreaterThan(0); + expect(end - start).toBeLessThan(5000); // 5 second max for 10MB + }); +}); + describe('Error Recovery', () => { test('should handle incomplete template variables', () => { const result = tokenize('text {{incomplete'); From c4a946660d5218c5e624ce6f86d514bb0683d47f Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 21 Jul 2025 17:01:05 +0800 Subject: [PATCH 17/76] update poml extended proposal --- docs/proposals/poml_extended.md | 52 +++++++++++++-------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/docs/proposals/poml_extended.md b/docs/proposals/poml_extended.md index 8564b681..95f9dffb 100644 --- a/docs/proposals/poml_extended.md +++ b/docs/proposals/poml_extended.md @@ -103,38 +103,44 @@ There can be some intervening text here as well. Metadatas are information that is useful when parsing and rendering the file, such as context variables, stylesheets, version information, file paths, etc. File-level metadata can be included at any place of the file in a special `` tag. This metadata will be processed before any content parsing. +By default, metadata has no child contents. When child contents exist, `` tag must have type to specify what kind of content is provided. **Example:** ```xml - + + + + +{ "foo": "bar" } + ``` ## Architecture Design ### High-level Processing Pipeline -The core of the new architecture is a three-pass process: Tokenization and AST Parsing, Metadata Extraction, and Recursive Rendering. +The core of the new architecture is a three-pass process: Tokenization and AST Parsing, and Recursive Rendering. #### I. Tokenization and AST Parsing -This phase processes the raw file content through a standard compiling workflow: tokenization followed by parsing to an Abstract Syntax Tree (AST). +This phase processes the raw file content into an Abstract Syntax Tree (AST). It leverages the provided ExtendedPomlLexer. -* **Tokenization**: Standard XML tokenization logic is used to break the input into tokens (tags, text content, attributes, etc.). Additionally, template variables in `{{}}` format are identified and tokenized as special tokens to enable proper parsing and variable substitution. +* **Tokenization**: The ExtendedPomlLexer (using chevrotain) scans the entire input string and breaks it into a flat stream of tokens (TagOpen, Identifier, TextContent, TemplateOpen, etc.). This single lexing pass is sufficient for the entire mixed-content file. The distinction between "text" and "POML" is not made at this stage; it's simply a stream of tokens. +* **AST Parsing Algorithm**: A CST (Concrete Syntax Tree) or AST parser will consume the token stream from the lexer. The parser is stateful, using a `PomlContext` object to track parsing configurations. -* **AST Parsing Algorithm**: - 1. Scan until `<` and tag name is found. - 2. If the tag name is `text`, create a text node and scan until the corresponding `
    ` is found (handling nested POML if present; template variables are not considered here). - 3. If the tag name matches any POML tag from `componentDocs.json`, create a node with the tag name and attributes (template variables `{{}}` in attribute values are parsed as child template nodes). - 4. Within POML tags, if another `text` tag is found, follow the same logic as step 2. - 5. Template variables `{{}}` found within text content or attribute values create TEMPLATE nodes as children. - 6. Close the node when the corresponding closing tag `` is found. + 1. The parser starts in "text mode". It consumes TextContent, TemplateOpen/TemplateClose, and other non-tag tokens, bundling them into TEXT or TEMPLATE nodes. + 2. When a TagOpen (`<`) token is followed by the Identifier "meta", a META node is created. Its attributes are immediately parsed to populate the `PomlContext`. This allows metadata to control the parsing of the remainder of the file (e.g., by enabling new tags). The META node is added to the AST but will be ignored during rendering. + 3. When a TagOpen (`<`) token is followed by an Identifier that matches a known POML component (from componentDocs.json and enabled via PomlContext), the parser switches to "POML mode" and creates a POML node. + 4. In "POML mode," it parses attributes (Identifier, Equals, DoubleQuote/SingleQuote), nested tags, and content until it finds a matching TagClosingOpen (`<`) token. Template variables `{{}}` within attribute values or content are parsed into child TEMPLATE nodes. + 5. If the tag is ``, it creates a POML node for `` itself, but its *children* are parsed by recursively applying the "text mode" logic (step 1), allowing for nested POML within ``. + 6. If a TagOpen is followed by an Identifier that is *not* a known POML component, the parser treats these tokens (`<`, tagname, `>`) as literal text and reverts to "text mode". + 7. The parser closes the current POML node when the corresponding TagClosingOpen (`<`) and Identifier are found. After closing the top-level POML tag, it reverts to "text mode". -* **Error Tolerance**: The parser is designed to be error-tolerant, gracefully handling malformed markup while preserving as much structure as possible. +* **Error Tolerance**: The parser will be designed to be error-tolerant. If a closing tag is missing, it can infer closure at the end of the file or when a new top-level tag begins, logging a diagnostic warning. -* **Source Mapping**: The parser retains source mapping information for each AST node, enabling code intelligence features like hover, go to definition, find references, and auto completion. +* **Source Mapping**: The chevrotain tokens inherently contain offset, line, and column information. This data is directly transferred to the ASTNode during parsing, enabling robust code intelligence features. * **Output**: An AST representing the hierarchical structure of the document, where each node contains source position information and type metadata. @@ -190,14 +196,6 @@ interface ASTNode { } ``` -#### II. Metadata Processing - -Once the AST is built, all `META` nodes are processed. - - * **Extraction**: Traverse the AST to find all `META` nodes. - * **Population**: Parse the content of each `` tag and populate the global `PomlContext` object. - * **Removal**: After processing, `META` nodes are removed from the AST to prevent them from being rendered. - **`PomlContext` Interface**: This context object is the single source of truth for the entire file, passed through all readers. It's mutable, allowing stateful operations like `` to have a file-wide effect. ```typescript @@ -209,7 +207,7 @@ interface PomlContext { } ``` -#### III. Text/POML Dispatching (Recursive Rendering) +#### II. Text/POML Dispatching (Recursive Rendering) Rendering starts at the root of the AST and proceeds recursively. A controller dispatches AST nodes to the appropriate reader. @@ -255,11 +253,3 @@ To achieve this design, the existing `PomlFile` class needs significant refactor 3. **Handling ``**: * The `handleInclude` method should be **removed** from `PomlFile`. Inclusion is now handled at a higher level by the main processing pipeline. When the `PomlReader` encounters an `` tag, it will invoke the entire pipeline (Segmentation, Metadata, Rendering) on the included file and insert the resulting React elements. - -4. **Parsing `TEXT` Placeholders**: - - * The core `parseXmlElement` method needs a new branch to handle the `` placeholder. - * When it encounters this element: - 1. It extracts the `ref` attribute (e.g., `"TEXT_ID_123"`). - 2. It looks up the corresponding raw text from `context.texts`. - 3. It fetches from the `context.texts` map and returns a React element containing the pure text content. From 756f9f859ca5f3c1135fff5ebe0011ffbdd756f2 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Thu, 24 Jul 2025 18:40:28 +0800 Subject: [PATCH 18/76] unverified cst impl --- packages/poml/base.tsx | 4 + packages/poml/reader/cst.ts | 730 +++++++++++++++++++++++++ packages/poml/tests/reader/cst.test.ts | 129 +++++ 3 files changed, 863 insertions(+) create mode 100644 packages/poml/tests/reader/cst.test.ts diff --git a/packages/poml/base.tsx b/packages/poml/base.tsx index e2a54d5c..8d87788a 100644 --- a/packages/poml/base.tsx +++ b/packages/poml/base.tsx @@ -873,3 +873,7 @@ export function findComponentByAliasOrUndefined(alias: string): PomlComponent | export function listComponents() { return ComponentRegistry.instance.listComponents(); } + +export function listComponentAliases() { + return listComponents().map(c => c.getAliases()).flat(); +} diff --git a/packages/poml/reader/cst.ts b/packages/poml/reader/cst.ts index e69de29b..e97d6d80 100644 --- a/packages/poml/reader/cst.ts +++ b/packages/poml/reader/cst.ts @@ -0,0 +1,730 @@ +import { IToken } from 'chevrotain'; +import { + extendedPomlLexer, + TemplateOpen, TemplateClose, TagClosingOpen, TagSelfClose, + TagOpen, TagClose, Equals, DoubleQuote, SingleQuote, + Identifier, Whitespace, TextContent +} from './lexer'; + +import { listComponentAliases } from '../base'; + +// Source position interfaces +export interface SourceRange { + start: number; + end: number; +} + +export interface AttributeInfo { + key: string; + value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; + keyRange: SourceRange; + valueRange: SourceRange; + fullRange: SourceRange; +} + +// Core AST node interface +export interface ASTNode { + id: string; + kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE'; + start: number; + end: number; + content: string; + parent?: ASTNode; + children: ASTNode[]; + + // For POML and META nodes + tagName?: string; + attributes?: AttributeInfo[]; + + // Detailed source positions + openingTag?: { + start: number; + end: number; + nameRange: SourceRange; + }; + + closingTag?: { + start: number; + end: number; + nameRange: SourceRange; + }; + + contentRange?: SourceRange; + + // For TEXT nodes + textSegments?: SourceRange[]; + + // For TEMPLATE nodes + expression?: string; +} + +// Context for parsing configuration +export interface PomlContext { + variables: { [key: string]: any }; + stylesheet: { [key: string]: string }; + minimalPomlVersion?: string; + sourcePath: string; + enabledComponents: Set; + unknownComponentBehavior: 'error' | 'warning' | 'ignore'; +} + +// CST Parser class +export class CSTParser { + private tokens: IToken[]; + private position: number; + private text: string; + private context: PomlContext; + private nodeIdCounter: number; + + constructor(context: PomlContext) { + this.tokens = []; + this.position = 0; + this.text = ''; + this.context = context; + this.nodeIdCounter = 0; + + // Initialize default enabled components (can be extended/disabled via meta tags) + this.context.enabledComponents = new Set(listComponentAliases()); + this.context.unknownComponentBehavior = 'warning'; + } + + private generateId(): string { + return `node_${++this.nodeIdCounter}`; + } + + private currentToken(): IToken | undefined { + return this.tokens[this.position]; + } + + private peekToken(offset: number = 1): IToken | undefined { + return this.tokens[this.position + offset]; + } + + private consumeToken(): IToken | undefined { + if (this.position < this.tokens.length) { + return this.tokens[this.position++]; + } + return undefined; + } + + private skipWhitespace(): void { + while (this.currentToken()?.tokenType === Whitespace) { + this.position++; + } + } + + public parse(text: string): ASTNode { + this.text = text; + const lexResult = extendedPomlLexer.tokenize(text); + this.tokens = lexResult.tokens; + this.position = 0; + + const rootNode: ASTNode = { + id: this.generateId(), + kind: 'TEXT', + start: 0, + end: text.length, + content: text, + children: [], + textSegments: [] + }; + + this.parseDocument(rootNode); + return rootNode; + } + + private parseDocument(rootNode: ASTNode): void { + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (!token) { + break; + } + + if (token.tokenType === TagOpen) { + const nextToken = this.peekToken(); + if (nextToken?.tokenType === Identifier) { + const tagName = nextToken.image; + + if (tagName === 'meta') { + const metaNode = this.parseMetaTag(); + if (metaNode) { + rootNode.children.push(metaNode); + metaNode.parent = rootNode; + this.processMeta(metaNode); + } + } else if (this.context.enabledComponents.has(tagName)) { + const pomlNode = this.parsePomlElement(); + if (pomlNode) { + rootNode.children.push(pomlNode); + pomlNode.parent = rootNode; + } + } else { + // Unknown tag - treat as text + this.handleUnknownTag(tagName); + const textNode = this.parseTextContent(); + if (textNode) { + rootNode.children.push(textNode); + textNode.parent = rootNode; + } + } + } else { + // Malformed tag - treat as text + const textNode = this.parseTextContent(); + if (textNode) { + rootNode.children.push(textNode); + textNode.parent = rootNode; + } + } + } else { + const textNode = this.parseTextContent(); + if (textNode) { + rootNode.children.push(textNode); + textNode.parent = rootNode; + } + } + } + } + + private parseMetaTag(): ASTNode | null { + const startPos = this.position; + const openTagStart = this.currentToken()?.startOffset || 0; + + this.consumeToken(); // consume '<' + this.skipWhitespace(); + + const nameToken = this.consumeToken(); // consume 'meta' + if (!nameToken || nameToken.image !== 'meta') { + return null; + } + + const nameRange: SourceRange = { + start: nameToken.startOffset || 0, + end: (nameToken.endOffset || 0) + 1 + }; + + this.skipWhitespace(); + + const attributes = this.parseAttributes(); + + this.skipWhitespace(); + + // Check for self-closing or regular closing + const closeToken = this.currentToken(); + let openTagEnd = 0; + let hasContent = false; + + if (closeToken?.tokenType === TagSelfClose) { + this.consumeToken(); // consume '/>' + openTagEnd = (closeToken.endOffset || 0) + 1; + } else if (closeToken?.tokenType === TagClose) { + this.consumeToken(); // consume '>' + openTagEnd = (closeToken.endOffset || 0) + 1; + hasContent = true; + } + + const metaNode: ASTNode = { + id: this.generateId(), + kind: 'META', + start: openTagStart, + end: openTagEnd, // Will be updated if there's content + content: '', + children: [], + tagName: 'meta', + attributes, + openingTag: { + start: openTagStart, + end: openTagEnd, + nameRange + } + }; + + if (hasContent) { + // Parse content until closing tag + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (token?.tokenType === TagClosingOpen) { + const nextToken = this.peekToken(); + if (nextToken?.tokenType === Identifier && nextToken.image === 'meta') { + break; + } + } + this.position++; + } + + // Parse closing tag + if (this.currentToken()?.tokenType === TagClosingOpen) { + const closingTagStart = this.currentToken()?.startOffset || 0; + this.consumeToken(); // consume '' + + if (closingNameToken && finalClose) { + metaNode.closingTag = { + start: closingTagStart, + end: (finalClose.endOffset || 0) + 1, + nameRange: { + start: closingNameToken.startOffset || 0, + end: (closingNameToken.endOffset || 0) + 1 + } + }; + metaNode.end = (finalClose.endOffset || 0) + 1; + } + } + } + + metaNode.content = this.text.slice(metaNode.start, metaNode.end); + return metaNode; + } + + private parsePomlElement(): ASTNode | null { + const openTagStart = this.currentToken()?.startOffset || 0; + + this.consumeToken(); // consume '<' + this.skipWhitespace(); + + const nameToken = this.consumeToken(); + if (!nameToken) { + return null; + } + + const tagName = nameToken.image; + const nameRange: SourceRange = { + start: nameToken.startOffset || 0, + end: (nameToken.endOffset || 0) + 1 + }; + + this.skipWhitespace(); + + const attributes = this.parseAttributes(); + + this.skipWhitespace(); + + // Check for self-closing or regular closing + const closeToken = this.currentToken(); + let openTagEnd = 0; + let hasContent = false; + + if (closeToken?.tokenType === TagSelfClose) { + this.consumeToken(); // consume '/>' + openTagEnd = (closeToken.endOffset || 0) + 1; + } else if (closeToken?.tokenType === TagClose) { + this.consumeToken(); // consume '>' + openTagEnd = (closeToken.endOffset || 0) + 1; + hasContent = true; + } + + const pomlNode: ASTNode = { + id: this.generateId(), + kind: 'POML', + start: openTagStart, + end: openTagEnd, // Will be updated if there's content + content: '', + children: [], + tagName, + attributes, + openingTag: { + start: openTagStart, + end: openTagEnd, + nameRange + } + }; + + if (hasContent) { + if (tagName === 'text') { + // Special handling for tags - parse content as pure text + this.parseTextContentForTextTag(pomlNode); + } else { + // Parse mixed content (POML and text) + this.parseMixedContent(pomlNode); + } + + // Parse closing tag + if (this.currentToken()?.tokenType === TagClosingOpen) { + const closingTagStart = this.currentToken()?.startOffset || 0; + this.consumeToken(); // consume '' + + if (closingNameToken && finalClose) { + pomlNode.closingTag = { + start: closingTagStart, + end: (finalClose.endOffset || 0) + 1, + nameRange: { + start: closingNameToken.startOffset || 0, + end: (closingNameToken.endOffset || 0) + 1 + } + }; + pomlNode.end = (finalClose.endOffset || 0) + 1; + } + } + } + + pomlNode.content = this.text.slice(pomlNode.start, pomlNode.end); + return pomlNode; + } + + private parseTextContentForTextTag(parentNode: ASTNode): void { + // In tags, we parse content as pure text but still need to handle nested POML + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (!token) { + break; + } + + if (token.tokenType === TagClosingOpen) { + const nextToken = this.peekToken(); + if (nextToken?.tokenType === Identifier && nextToken.image === parentNode.tagName) { + break; // Found closing tag + } + } + + if (token.tokenType === TagOpen) { + const nextToken = this.peekToken(); + if (nextToken?.tokenType === Identifier && this.context.enabledComponents.has(nextToken.image)) { + // Found nested POML element + const nestedNode = this.parsePomlElement(); + if (nestedNode) { + parentNode.children.push(nestedNode); + nestedNode.parent = parentNode; + } + } else { + // Treat as text + const textNode = this.parseTextContent(); + if (textNode) { + parentNode.children.push(textNode); + textNode.parent = parentNode; + } + } + } else { + const textNode = this.parseTextContent(); + if (textNode) { + parentNode.children.push(textNode); + textNode.parent = parentNode; + } + } + } + } + + private parseMixedContent(parentNode: ASTNode): void { + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (!token) { + break; + } + + if (token.tokenType === TagClosingOpen) { + const nextToken = this.peekToken(); + if (nextToken?.tokenType === Identifier && nextToken.image === parentNode.tagName) { + break; // Found closing tag + } + } + + if (token.tokenType === TagOpen) { + const nextToken = this.peekToken(); + if (nextToken?.tokenType === Identifier && this.context.enabledComponents.has(nextToken.image)) { + // Found nested POML element + const nestedNode = this.parsePomlElement(); + if (nestedNode) { + parentNode.children.push(nestedNode); + nestedNode.parent = parentNode; + } + } else { + // Unknown tag or malformed - treat as text + const textNode = this.parseTextContent(); + if (textNode) { + parentNode.children.push(textNode); + textNode.parent = parentNode; + } + } + } else if (token.tokenType === TemplateOpen) { + // Parse template expression + const templateNode = this.parseTemplate(); + if (templateNode) { + parentNode.children.push(templateNode); + templateNode.parent = parentNode; + } + } else { + const textNode = this.parseTextContent(); + if (textNode) { + parentNode.children.push(textNode); + textNode.parent = parentNode; + } + } + } + } + + private parseTextContent(): ASTNode | null { + const startOffset = this.currentToken()?.startOffset || 0; + let endOffset = startOffset; + + // Collect consecutive text tokens + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (!token) { + break; + } + + if (token.tokenType === TextContent || token.tokenType === Whitespace) { + endOffset = (token.endOffset || 0) + 1; + this.position++; + } else if (token.tokenType === TagOpen || token.tokenType === TemplateOpen || token.tokenType === TagClosingOpen) { + break; + } else { + // Other tokens treated as text in this context + endOffset = (token.endOffset || 0) + 1; + this.position++; + } + } + + if (endOffset === startOffset) { + return null; + } + + const textNode: ASTNode = { + id: this.generateId(), + kind: 'TEXT', + start: startOffset, + end: endOffset, + content: this.text.slice(startOffset, endOffset), + children: [], + textSegments: [{ start: startOffset, end: endOffset }] + }; + + return textNode; + } + + private parseTemplate(): ASTNode | null { + const startToken = this.currentToken(); + if (!startToken || startToken.tokenType !== TemplateOpen) { + return null; + } + + const startOffset = startToken.startOffset || 0; + this.consumeToken(); // consume '{{' + + let expression = ''; + let endOffset = startOffset + 2; + + // Collect content until TemplateClose + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (!token) { + break; + } + + if (token.tokenType === TemplateClose) { + endOffset = (token.endOffset || 0) + 1; + this.consumeToken(); + break; + } else { + expression += token.image; + endOffset = (token.endOffset || 0) + 1; + this.consumeToken(); + } + } + + const templateNode: ASTNode = { + id: this.generateId(), + kind: 'TEMPLATE', + start: startOffset, + end: endOffset, + content: this.text.slice(startOffset, endOffset), + children: [], + expression: expression.trim() + }; + + return templateNode; + } + + private parseAttributes(): AttributeInfo[] { + const attributes: AttributeInfo[] = []; + + while (this.position < this.tokens.length) { + this.skipWhitespace(); + + const token = this.currentToken(); + if (!token || token.tokenType !== Identifier) { + break; + } + + const keyToken = this.consumeToken()!; + const keyRange: SourceRange = { + start: keyToken.startOffset || 0, + end: (keyToken.endOffset || 0) + 1 + }; + + this.skipWhitespace(); + + if (this.currentToken()?.tokenType !== Equals) { + // Boolean attribute + attributes.push({ + key: keyToken.image, + value: [{ + id: this.generateId(), + kind: 'TEXT', + start: keyRange.start, + end: keyRange.end, + content: 'true', + children: [] + }], + keyRange, + valueRange: keyRange, + fullRange: keyRange + }); + continue; + } + + this.consumeToken(); // consume '=' + this.skipWhitespace(); + + const quoteToken = this.currentToken(); + if (!quoteToken || (quoteToken.tokenType !== DoubleQuote && quoteToken.tokenType !== SingleQuote)) { + break; // Invalid attribute + } + + const isDoubleQuote = quoteToken.tokenType === DoubleQuote; + const valueStart = (quoteToken.endOffset || 0) + 1; + this.consumeToken(); // consume opening quote + + const valueNodes: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] = []; + let valueEnd = valueStart; + + // Parse attribute value content + while (this.position < this.tokens.length) { + const token = this.currentToken(); + if (!token) { + break; + } + + if ((isDoubleQuote && token.tokenType === DoubleQuote) || + (!isDoubleQuote && token.tokenType === SingleQuote)) { + valueEnd = token.startOffset || valueEnd; + this.consumeToken(); // consume closing quote + break; + } else if (token.tokenType === TemplateOpen) { + const templateNode = this.parseTemplate(); + if (templateNode && (templateNode.kind === 'TEXT' || templateNode.kind === 'TEMPLATE')) { + valueNodes.push(templateNode as ASTNode & { kind: 'TEXT' | 'TEMPLATE' }); + } + } else { + // Collect text content + const textStart = token.startOffset || 0; + let textEnd = (token.endOffset || 0) + 1; + let textContent = token.image; + + this.consumeToken(); + + // Collect more text tokens + while (this.position < this.tokens.length) { + const nextToken = this.currentToken(); + if (!nextToken) { + break; + } + + if ((isDoubleQuote && nextToken.tokenType === DoubleQuote) || + (!isDoubleQuote && nextToken.tokenType === SingleQuote) || + nextToken.tokenType === TemplateOpen) { + break; + } + + textContent += nextToken.image; + textEnd = (nextToken.endOffset || 0) + 1; + this.consumeToken(); + } + + valueNodes.push({ + id: this.generateId(), + kind: 'TEXT', + start: textStart, + end: textEnd, + content: textContent, + children: [] + }); + } + } + + const valueRange: SourceRange = { start: valueStart, end: valueEnd }; + const fullRange: SourceRange = { + start: keyRange.start, + end: (this.tokens[this.position - 1]?.endOffset || 0) + 1 + }; + + attributes.push({ + key: keyToken.image, + value: valueNodes, + keyRange, + valueRange, + fullRange + }); + } + + return attributes; + } + + private processMeta(metaNode: ASTNode): void { + if (!metaNode.attributes) { + return; + } + + for (const attr of metaNode.attributes) { + switch (attr.key) { + case 'components': + this.processComponentsAttribute(attr.value); + break; + case 'unknownComponents': + const behavior = attr.value[0]?.content; + if (behavior === 'error' || behavior === 'warning' || behavior === 'ignore') { + this.context.unknownComponentBehavior = behavior; + } + break; + case 'minimalPomlVersion': + this.context.minimalPomlVersion = attr.value[0]?.content; + break; + // Add other meta attributes as needed + } + } + } + + private processComponentsAttribute(value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]): void { + const components = value[0]?.content || ''; + const parts = components.split(',').map(s => s.trim()); + + for (const part of parts) { + if (part.startsWith('+')) { + this.context.enabledComponents.add(part.slice(1)); + } else if (part.startsWith('-')) { + this.context.enabledComponents.delete(part.slice(1)); + } + } + } + + private handleUnknownTag(tagName: string): void { + switch (this.context.unknownComponentBehavior) { + case 'error': + throw new Error(`Unknown POML component: ${tagName}`); + case 'warning': + console.warn(`Unknown POML component: ${tagName}`); + break; + case 'ignore': + // Do nothing + break; + } + } +} + +// Export function to create and use the parser +export function parseExtendedPoml(text: string, context: Partial = {}): ASTNode { + const fullContext: PomlContext = { + variables: {}, + stylesheet: {}, + sourcePath: '', + enabledComponents: new Set(), + unknownComponentBehavior: 'warning', + ...context + }; + + const parser = new CSTParser(fullContext); + return parser.parse(text); +} \ No newline at end of file diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts new file mode 100644 index 00000000..1e9e5ad6 --- /dev/null +++ b/packages/poml/tests/reader/cst.test.ts @@ -0,0 +1,129 @@ +import { describe, expect, test } from '@jest/globals'; +import { parseExtendedPoml, ASTNode } from 'poml/reader/cst'; + +describe('Extended POML CST Parser', () => { + test('parses pure text content', () => { + const input = 'This is plain text content.'; + const result = parseExtendedPoml(input); + + expect(result.kind).toBe('TEXT'); + expect(result.content).toBe(input); + expect(result.children).toHaveLength(0); + }); + + test('parses simple POML element', () => { + const input = 'Analyze the data'; + const result = parseExtendedPoml(input); + + expect(result.kind).toBe('TEXT'); + expect(result.children).toHaveLength(1); + + const taskNode = result.children[0]; + expect(taskNode.kind).toBe('POML'); + expect(taskNode.tagName).toBe('task'); + expect(taskNode.children).toHaveLength(1); + expect(taskNode.children[0].content).toBe('Analyze the data'); + }); + + test('parses mixed content', () => { + const input = `# My Document + +This is regular text. + + + Process this data + + +More text here.`; + + const result = parseExtendedPoml(input); + + expect(result.kind).toBe('TEXT'); + expect(result.children.length).toBeGreaterThan(1); + + // Should have text nodes and POML nodes + const pomlNodes = result.children.filter(child => child.kind === 'POML'); + expect(pomlNodes).toHaveLength(1); + expect(pomlNodes[0].tagName).toBe('task'); + }); + + test('parses self-closing elements', () => { + const input = ''; + const result = parseExtendedPoml(input); + + expect(result.children).toHaveLength(1); + const metaNode = result.children[0]; + expect(metaNode.kind).toBe('META'); + expect(metaNode.tagName).toBe('meta'); + expect(metaNode.attributes).toHaveLength(1); + expect(metaNode.attributes![0].key).toBe('components'); + }); + + test('parses template expressions', () => { + const input = 'Hello {{name}}!'; + const result = parseExtendedPoml(input); + + expect(result.children.length).toBeGreaterThan(1); + const templateNode = result.children.find(child => child.kind === 'TEMPLATE'); + expect(templateNode).toBeDefined(); + expect(templateNode!.expression).toBe('name'); + }); + + test('parses attributes with mixed content', () => { + const input = '

    Content

    '; + const result = parseExtendedPoml(input); + + const pNode = result.children.find(child => child.kind === 'POML'); + expect(pNode).toBeDefined(); + expect(pNode!.attributes).toHaveLength(2); + + const classAttr = pNode!.attributes!.find(attr => attr.key === 'class'); + expect(classAttr).toBeDefined(); + expect(classAttr!.value[0].content).toBe('header'); + + const idAttr = pNode!.attributes!.find(attr => attr.key === 'id'); + expect(idAttr).toBeDefined(); + expect(idAttr!.value[0].kind).toBe('TEMPLATE'); + }); + + test('handles text tag with nested POML', () => { + const input = ` +This is **markdown** content. +This is nested POML +More markdown here. +`; + + const result = parseExtendedPoml(input); + const textNode = result.children.find(child => child.kind === 'POML' && child.tagName === 'text'); + + expect(textNode).toBeDefined(); + expect(textNode!.children.length).toBeGreaterThan(1); + + const cpNode = textNode!.children.find(child => child.kind === 'POML' && child.tagName === 'cp'); + expect(cpNode).toBeDefined(); + }); + + test('preserves source position information', () => { + const input = 'Test'; + const result = parseExtendedPoml(input); + + const taskNode = result.children[0]; + expect(taskNode.start).toBe(0); + expect(taskNode.end).toBe(input.length); + expect(taskNode.openingTag).toBeDefined(); + expect(taskNode.closingTag).toBeDefined(); + expect(taskNode.openingTag!.nameRange.start).toBeGreaterThan(0); + expect(taskNode.openingTag!.nameRange.end).toBeGreaterThan(taskNode.openingTag!.nameRange.start); + }); + + test('handles unknown components gracefully', () => { + const input = 'This should be treated as text'; + + // Should not throw by default (warning behavior) + const result = parseExtendedPoml(input); + expect(result).toBeDefined(); + + // Should treat unknown tag as text content + expect(result.children.length).toBeGreaterThan(0); + }); +}); \ No newline at end of file From f57785d65017d788b5574f91948f1d999f66ed4f Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 26 Aug 2025 14:57:17 +0800 Subject: [PATCH 19/76] move to next --- packages/poml/{reader => next}/ast.ts | 177 ++++++++++---------- packages/poml/{reader => next}/base.tsx | 0 packages/poml/next/context.ts | 17 ++ packages/poml/{reader => next}/cst.ts | 164 ++++++++++-------- packages/poml/{reader => next}/index.tsx | 0 packages/poml/{reader => next}/lexer.ts | 12 +- packages/poml/{reader => next}/meta.ts | 0 packages/poml/{reader => next}/poml.tsx | 0 packages/poml/{reader => next}/segment.ts | 79 +++++---- packages/poml/{reader => next}/text.tsx | 0 packages/poml/{reader => next}/tokenizer.ts | 58 +++---- packages/poml/tests/reader/ast.test.ts | 144 ++++++++-------- packages/poml/tests/reader/cst.test.ts | 52 +++--- packages/poml/tests/reader/lexer.test.ts | 114 +++++-------- 14 files changed, 419 insertions(+), 398 deletions(-) rename packages/poml/{reader => next}/ast.ts (79%) rename packages/poml/{reader => next}/base.tsx (100%) create mode 100644 packages/poml/next/context.ts rename packages/poml/{reader => next}/cst.ts (91%) rename packages/poml/{reader => next}/index.tsx (100%) rename packages/poml/{reader => next}/lexer.ts (92%) rename packages/poml/{reader => next}/meta.ts (100%) rename packages/poml/{reader => next}/poml.tsx (100%) rename packages/poml/{reader => next}/segment.ts (91%) rename packages/poml/{reader => next}/text.tsx (100%) rename packages/poml/{reader => next}/tokenizer.ts (82%) diff --git a/packages/poml/reader/ast.ts b/packages/poml/next/ast.ts similarity index 79% rename from packages/poml/reader/ast.ts rename to packages/poml/next/ast.ts index 3b758e1f..c8db6b8a 100644 --- a/packages/poml/reader/ast.ts +++ b/packages/poml/next/ast.ts @@ -9,49 +9,48 @@ export interface SourceRange { export interface AttributeInfo { key: string; - value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; // Mixed content: array of text/template nodes - keyRange: SourceRange; // Position of attribute name - valueRange: SourceRange; // Position of attribute value (excluding quotes) - fullRange: SourceRange; // Full attribute including key="value" + value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; // Mixed content: array of text/template nodes + keyRange: SourceRange; // Position of attribute name + valueRange: SourceRange; // Position of attribute value (excluding quotes) + fullRange: SourceRange; // Full attribute including key="value" } // Main AST node interface export interface ASTNode { - id: string; // Unique ID for caching and React keys + id: string; // Unique ID for caching and React keys kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE'; - start: number; // Source position start of entire node - end: number; // Source position end of entire node - content: string; // The raw string content - parent?: ASTNode; // Reference to the parent node - children: ASTNode[]; // Child nodes - + start: number; // Source position start of entire node + end: number; // Source position end of entire node + content: string; // The raw string content + parent?: ASTNode; // Reference to the parent node + children: ASTNode[]; // Child nodes + // For POML and META nodes - tagName?: string; // Tag name (e.g., 'task', 'meta') - attributes?: AttributeInfo[]; // Detailed attribute information - + tagName?: string; // Tag name (e.g., 'task', 'meta') + attributes?: AttributeInfo[]; // Detailed attribute information + // Detailed source positions openingTag?: { - start: number; // Position of '<' - end: number; // Position after '>' - nameRange: SourceRange; // Position of tag name + start: number; // Position of '<' + end: number; // Position after '>' + nameRange: SourceRange; // Position of tag name }; - + closingTag?: { - start: number; // Position of '' - nameRange: SourceRange; // Position of tag name in closing tag + start: number; // Position of '' + nameRange: SourceRange; // Position of tag name in closing tag }; - - contentRange?: SourceRange; // Position of content between tags (excluding nested tags) - + + contentRange?: SourceRange; // Position of content between tags (excluding nested tags) + // For TEXT nodes - textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML) - + textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML) + // For TEMPLATE nodes - expression?: string; // The full expression content between {{}} + expression?: string; // The full expression content between {{}} } - // AST Parser class class ASTParser { private tokens: Token[]; @@ -68,20 +67,25 @@ class ASTParser { private buildValidTagsSet(): Set { const validTags = new Set(); - + for (const doc of componentDocs) { if (doc.name) { validTags.add(doc.name.toLowerCase()); // Convert camelCase to kebab-case - validTags.add(doc.name.toLowerCase().replace(/([A-Z])/g, '-$1').toLowerCase()); + validTags.add( + doc.name + .toLowerCase() + .replace(/([A-Z])/g, '-$1') + .toLowerCase(), + ); } } - + // Add special tags validTags.add('poml'); validTags.add('text'); validTags.add('meta'); - + return validTags; } @@ -108,10 +112,10 @@ class ASTParser { // Parse attribute value for mixed text and template variables const result: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] = []; let currentPos = 0; - + while (currentPos < value.length) { const templateStart = value.indexOf('{{', currentPos); - + if (templateStart === -1) { // No more template variables, add remaining text if (currentPos < value.length) { @@ -121,12 +125,12 @@ class ASTParser { start: currentPos, end: value.length, content: value.substring(currentPos), - children: [] + children: [], }); } break; } - + // Add text before template variable if (templateStart > currentPos) { result.push({ @@ -135,10 +139,10 @@ class ASTParser { start: currentPos, end: templateStart, content: value.substring(currentPos, templateStart), - children: [] + children: [], }); } - + // Find end of template variable const templateEnd = value.indexOf('}}', templateStart + 2); if (templateEnd === -1) { @@ -149,11 +153,11 @@ class ASTParser { start: templateStart, end: value.length, content: value.substring(templateStart), - children: [] + children: [], }); break; } - + // Add template variable const templateContent = value.substring(templateStart + 2, templateEnd); result.push({ @@ -163,78 +167,78 @@ class ASTParser { end: templateEnd + 2, content: value.substring(templateStart, templateEnd + 2), expression: templateContent.trim(), - children: [] + children: [], }); - + currentPos = templateEnd + 2; } - + return result; } private parseAttributes(tagContent: string): AttributeInfo[] { const attributes: AttributeInfo[] = []; - + // Simple attribute parsing - can be enhanced later const attrRegex = /(\w+)=["']([^"']*?)["']/g; let match; - + while ((match = attrRegex.exec(tagContent)) !== null) { const key = match[1]; const value = match[2]; const fullMatch = match[0]; const matchStart = match.index; - + attributes.push({ key, value: this.parseAttributeValue(value), keyRange: { start: matchStart, end: matchStart + key.length }, valueRange: { start: matchStart + key.length + 2, end: matchStart + key.length + 2 + value.length }, - fullRange: { start: matchStart, end: matchStart + fullMatch.length } + fullRange: { start: matchStart, end: matchStart + fullMatch.length }, }); } - + return attributes; } parse(): ASTNode { const children = this.parseNodes(); - + if (children.length === 1 && children[0].kind === 'POML') { return children[0]; } - + // Create root text node const rootNode: ASTNode = { id: this.generateId(), kind: 'TEXT', start: 0, end: this.tokens.length > 0 ? this.tokens[this.tokens.length - 1].end : 0, - content: this.tokens.map(t => t.value).join(''), + content: this.tokens.map((t) => t.value).join(''), children, - textSegments: [] + textSegments: [], }; - + // Set parent references - children.forEach(child => { + children.forEach((child) => { child.parent = rootNode; }); - + return rootNode; } private parseNodes(): ASTNode[] { const nodes: ASTNode[] = []; - + while (this.position < this.tokens.length) { const token = this.peek(); if (!token) break; - + if (token.type === 'TEMPLATE_VAR') { nodes.push(this.parseTemplateVariable()); } else if (token.type === 'TAG_OPEN') { const tagName = this.extractTagName(token.value); - + if (this.validPomlTags.has(tagName.toLowerCase())) { const node = this.parsePomlNode(); if (node) { @@ -251,14 +255,14 @@ class ASTParser { this.advance(); } } - + return nodes; } private parseTemplateVariable(): ASTNode { const token = this.advance()!; const expression = token.value.slice(2, -2).trim(); // Remove {{ and }} - + return { id: this.generateId(), kind: 'TEMPLATE', @@ -266,13 +270,13 @@ class ASTParser { end: token.end, content: token.value, expression, - children: [] + children: [], }; } private parseTextFromToken(): ASTNode { const token = this.advance()!; - + return { id: this.generateId(), kind: 'TEXT', @@ -280,20 +284,20 @@ class ASTParser { end: token.end, content: token.value, children: [], - textSegments: [{ start: token.start, end: token.end }] + textSegments: [{ start: token.start, end: token.end }], }; } private parsePomlNode(): ASTNode | null { const openToken = this.advance()!; const tagName = this.extractTagName(openToken.value); - + // Parse attributes const attributes = this.parseAttributes(openToken.value); - + // Determine node kind const kind = tagName.toLowerCase() === 'meta' ? 'META' : 'POML'; - + const node: ASTNode = { id: this.generateId(), kind, @@ -306,27 +310,27 @@ class ASTParser { openingTag: { start: openToken.start, end: openToken.end, - nameRange: { - start: openToken.start + 1, - end: openToken.start + 1 + tagName.length - } - } + nameRange: { + start: openToken.start + 1, + end: openToken.start + 1 + tagName.length, + }, + }, }; - + // Parse children until we find the closing tag const children: ASTNode[] = []; let depth = 1; - + while (this.position < this.tokens.length && depth > 0) { const token = this.peek(); if (!token) break; - + if (token.type === 'TAG_OPEN') { const childTagName = this.extractTagName(token.value); if (childTagName.toLowerCase() === tagName.toLowerCase()) { depth++; } - + // Special handling for text tags - don't process template variables if (tagName.toLowerCase() === 'text') { children.push(this.parseTextFromToken()); @@ -352,8 +356,8 @@ class ASTParser { end: closeToken.end, nameRange: { start: closeToken.start + 2, - end: closeToken.start + 2 + tagName.length - } + end: closeToken.start + 2 + tagName.length, + }, }; break; } @@ -370,17 +374,20 @@ class ASTParser { children.push(textNode); } } - + node.children = children; - + // Update content to include full tag if (node.closingTag) { - node.content = this.tokens.slice( - this.tokens.findIndex(t => t.start === node.start), - this.tokens.findIndex(t => t.end === node.end) + 1 - ).map(t => t.value).join(''); + node.content = this.tokens + .slice( + this.tokens.findIndex((t) => t.start === node.start), + this.tokens.findIndex((t) => t.end === node.end) + 1, + ) + .map((t) => t.value) + .join(''); } - + return node; } } @@ -397,4 +404,4 @@ export class PomlAstParser { static parse(content: string): ASTNode { return parseAST(content); } -} \ No newline at end of file +} diff --git a/packages/poml/reader/base.tsx b/packages/poml/next/base.tsx similarity index 100% rename from packages/poml/reader/base.tsx rename to packages/poml/next/base.tsx diff --git a/packages/poml/next/context.ts b/packages/poml/next/context.ts new file mode 100644 index 00000000..92939bda --- /dev/null +++ b/packages/poml/next/context.ts @@ -0,0 +1,17 @@ +export class ContextEvaluator { + private contextStore: { [key: string]: any } = {}; + private stack: Array<{ [key: string]: any }> = []; + + public setGlobalVariable(key: string, value: any) { + this.contextStore[key] = value; + } + + public setLocalVariable(key: string, value: any) { + if (this.stack.length === 0) { + throw new Error('No local stack available'); + } + this.stack[this.stack.length - 1][key] = value; + } + + public pushStack() {} +} diff --git a/packages/poml/reader/cst.ts b/packages/poml/next/cst.ts similarity index 91% rename from packages/poml/reader/cst.ts rename to packages/poml/next/cst.ts index e97d6d80..6ede8a39 100644 --- a/packages/poml/reader/cst.ts +++ b/packages/poml/next/cst.ts @@ -1,9 +1,18 @@ import { IToken } from 'chevrotain'; -import { - extendedPomlLexer, - TemplateOpen, TemplateClose, TagClosingOpen, TagSelfClose, - TagOpen, TagClose, Equals, DoubleQuote, SingleQuote, - Identifier, Whitespace, TextContent +import { + extendedPomlLexer, + TemplateOpen, + TemplateClose, + TagClosingOpen, + TagSelfClose, + TagOpen, + TagClose, + Equals, + DoubleQuote, + SingleQuote, + Identifier, + Whitespace, + TextContent, } from './lexer'; import { listComponentAliases } from '../base'; @@ -31,29 +40,29 @@ export interface ASTNode { content: string; parent?: ASTNode; children: ASTNode[]; - + // For POML and META nodes tagName?: string; attributes?: AttributeInfo[]; - + // Detailed source positions openingTag?: { start: number; end: number; nameRange: SourceRange; }; - + closingTag?: { start: number; end: number; nameRange: SourceRange; }; - + contentRange?: SourceRange; - + // For TEXT nodes textSegments?: SourceRange[]; - + // For TEMPLATE nodes expression?: string; } @@ -126,7 +135,7 @@ export class CSTParser { end: text.length, content: text, children: [], - textSegments: [] + textSegments: [], }; this.parseDocument(rootNode); @@ -144,7 +153,7 @@ export class CSTParser { const nextToken = this.peekToken(); if (nextToken?.tokenType === Identifier) { const tagName = nextToken.image; - + if (tagName === 'meta') { const metaNode = this.parseMetaTag(); if (metaNode) { @@ -188,10 +197,10 @@ export class CSTParser { private parseMetaTag(): ASTNode | null { const startPos = this.position; const openTagStart = this.currentToken()?.startOffset || 0; - + this.consumeToken(); // consume '<' this.skipWhitespace(); - + const nameToken = this.consumeToken(); // consume 'meta' if (!nameToken || nameToken.image !== 'meta') { return null; @@ -199,20 +208,20 @@ export class CSTParser { const nameRange: SourceRange = { start: nameToken.startOffset || 0, - end: (nameToken.endOffset || 0) + 1 + end: (nameToken.endOffset || 0) + 1, }; this.skipWhitespace(); - + const attributes = this.parseAttributes(); - + this.skipWhitespace(); - + // Check for self-closing or regular closing const closeToken = this.currentToken(); let openTagEnd = 0; let hasContent = false; - + if (closeToken?.tokenType === TagSelfClose) { this.consumeToken(); // consume '/>' openTagEnd = (closeToken.endOffset || 0) + 1; @@ -234,8 +243,8 @@ export class CSTParser { openingTag: { start: openTagStart, end: openTagEnd, - nameRange - } + nameRange, + }, }; if (hasContent) { @@ -250,7 +259,7 @@ export class CSTParser { } this.position++; } - + // Parse closing tag if (this.currentToken()?.tokenType === TagClosingOpen) { const closingTagStart = this.currentToken()?.startOffset || 0; @@ -258,15 +267,15 @@ export class CSTParser { const closingNameToken = this.consumeToken(); // consume 'meta' this.skipWhitespace(); const finalClose = this.consumeToken(); // consume '>' - + if (closingNameToken && finalClose) { metaNode.closingTag = { start: closingTagStart, end: (finalClose.endOffset || 0) + 1, nameRange: { start: closingNameToken.startOffset || 0, - end: (closingNameToken.endOffset || 0) + 1 - } + end: (closingNameToken.endOffset || 0) + 1, + }, }; metaNode.end = (finalClose.endOffset || 0) + 1; } @@ -279,10 +288,10 @@ export class CSTParser { private parsePomlElement(): ASTNode | null { const openTagStart = this.currentToken()?.startOffset || 0; - + this.consumeToken(); // consume '<' this.skipWhitespace(); - + const nameToken = this.consumeToken(); if (!nameToken) { return null; @@ -291,20 +300,20 @@ export class CSTParser { const tagName = nameToken.image; const nameRange: SourceRange = { start: nameToken.startOffset || 0, - end: (nameToken.endOffset || 0) + 1 + end: (nameToken.endOffset || 0) + 1, }; this.skipWhitespace(); - + const attributes = this.parseAttributes(); - + this.skipWhitespace(); - + // Check for self-closing or regular closing const closeToken = this.currentToken(); let openTagEnd = 0; let hasContent = false; - + if (closeToken?.tokenType === TagSelfClose) { this.consumeToken(); // consume '/>' openTagEnd = (closeToken.endOffset || 0) + 1; @@ -326,8 +335,8 @@ export class CSTParser { openingTag: { start: openTagStart, end: openTagEnd, - nameRange - } + nameRange, + }, }; if (hasContent) { @@ -338,7 +347,7 @@ export class CSTParser { // Parse mixed content (POML and text) this.parseMixedContent(pomlNode); } - + // Parse closing tag if (this.currentToken()?.tokenType === TagClosingOpen) { const closingTagStart = this.currentToken()?.startOffset || 0; @@ -346,15 +355,15 @@ export class CSTParser { const closingNameToken = this.consumeToken(); this.skipWhitespace(); const finalClose = this.consumeToken(); // consume '>' - + if (closingNameToken && finalClose) { pomlNode.closingTag = { start: closingTagStart, end: (finalClose.endOffset || 0) + 1, nameRange: { start: closingNameToken.startOffset || 0, - end: (closingNameToken.endOffset || 0) + 1 - } + end: (closingNameToken.endOffset || 0) + 1, + }, }; pomlNode.end = (finalClose.endOffset || 0) + 1; } @@ -469,7 +478,11 @@ export class CSTParser { if (token.tokenType === TextContent || token.tokenType === Whitespace) { endOffset = (token.endOffset || 0) + 1; this.position++; - } else if (token.tokenType === TagOpen || token.tokenType === TemplateOpen || token.tokenType === TagClosingOpen) { + } else if ( + token.tokenType === TagOpen || + token.tokenType === TemplateOpen || + token.tokenType === TagClosingOpen + ) { break; } else { // Other tokens treated as text in this context @@ -489,7 +502,7 @@ export class CSTParser { end: endOffset, content: this.text.slice(startOffset, endOffset), children: [], - textSegments: [{ start: startOffset, end: endOffset }] + textSegments: [{ start: startOffset, end: endOffset }], }; return textNode; @@ -532,7 +545,7 @@ export class CSTParser { end: endOffset, content: this.text.slice(startOffset, endOffset), children: [], - expression: expression.trim() + expression: expression.trim(), }; return templateNode; @@ -543,7 +556,7 @@ export class CSTParser { while (this.position < this.tokens.length) { this.skipWhitespace(); - + const token = this.currentToken(); if (!token || token.tokenType !== Identifier) { break; @@ -552,7 +565,7 @@ export class CSTParser { const keyToken = this.consumeToken()!; const keyRange: SourceRange = { start: keyToken.startOffset || 0, - end: (keyToken.endOffset || 0) + 1 + end: (keyToken.endOffset || 0) + 1, }; this.skipWhitespace(); @@ -561,17 +574,19 @@ export class CSTParser { // Boolean attribute attributes.push({ key: keyToken.image, - value: [{ - id: this.generateId(), - kind: 'TEXT', - start: keyRange.start, - end: keyRange.end, - content: 'true', - children: [] - }], + value: [ + { + id: this.generateId(), + kind: 'TEXT', + start: keyRange.start, + end: keyRange.end, + content: 'true', + children: [], + }, + ], keyRange, valueRange: keyRange, - fullRange: keyRange + fullRange: keyRange, }); continue; } @@ -595,11 +610,10 @@ export class CSTParser { while (this.position < this.tokens.length) { const token = this.currentToken(); if (!token) { - break; - } + break; + } - if ((isDoubleQuote && token.tokenType === DoubleQuote) || - (!isDoubleQuote && token.tokenType === SingleQuote)) { + if ((isDoubleQuote && token.tokenType === DoubleQuote) || (!isDoubleQuote && token.tokenType === SingleQuote)) { valueEnd = token.startOffset || valueEnd; this.consumeToken(); // consume closing quote break; @@ -613,22 +627,24 @@ export class CSTParser { const textStart = token.startOffset || 0; let textEnd = (token.endOffset || 0) + 1; let textContent = token.image; - + this.consumeToken(); - + // Collect more text tokens while (this.position < this.tokens.length) { const nextToken = this.currentToken(); if (!nextToken) { break; } - - if ((isDoubleQuote && nextToken.tokenType === DoubleQuote) || - (!isDoubleQuote && nextToken.tokenType === SingleQuote) || - nextToken.tokenType === TemplateOpen) { + + if ( + (isDoubleQuote && nextToken.tokenType === DoubleQuote) || + (!isDoubleQuote && nextToken.tokenType === SingleQuote) || + nextToken.tokenType === TemplateOpen + ) { break; } - + textContent += nextToken.image; textEnd = (nextToken.endOffset || 0) + 1; this.consumeToken(); @@ -640,15 +656,15 @@ export class CSTParser { start: textStart, end: textEnd, content: textContent, - children: [] + children: [], }); } } const valueRange: SourceRange = { start: valueStart, end: valueEnd }; - const fullRange: SourceRange = { - start: keyRange.start, - end: (this.tokens[this.position - 1]?.endOffset || 0) + 1 + const fullRange: SourceRange = { + start: keyRange.start, + end: (this.tokens[this.position - 1]?.endOffset || 0) + 1, }; attributes.push({ @@ -656,7 +672,7 @@ export class CSTParser { value: valueNodes, keyRange, valueRange, - fullRange + fullRange, }); } @@ -674,7 +690,7 @@ export class CSTParser { this.processComponentsAttribute(attr.value); break; case 'unknownComponents': - const behavior = attr.value[0]?.content; + const behavior = attr.value[0]?.content; // eslint-disable-line if (behavior === 'error' || behavior === 'warning' || behavior === 'ignore') { this.context.unknownComponentBehavior = behavior; } @@ -689,8 +705,8 @@ export class CSTParser { private processComponentsAttribute(value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]): void { const components = value[0]?.content || ''; - const parts = components.split(',').map(s => s.trim()); - + const parts = components.split(',').map((s) => s.trim()); + for (const part of parts) { if (part.startsWith('+')) { this.context.enabledComponents.add(part.slice(1)); @@ -722,9 +738,9 @@ export function parseExtendedPoml(text: string, context: Partial = sourcePath: '', enabledComponents: new Set(), unknownComponentBehavior: 'warning', - ...context + ...context, }; const parser = new CSTParser(fullContext); return parser.parse(text); -} \ No newline at end of file +} diff --git a/packages/poml/reader/index.tsx b/packages/poml/next/index.tsx similarity index 100% rename from packages/poml/reader/index.tsx rename to packages/poml/next/index.tsx diff --git a/packages/poml/reader/lexer.ts b/packages/poml/next/lexer.ts similarity index 92% rename from packages/poml/reader/lexer.ts rename to packages/poml/next/lexer.ts index cfa886b7..a646defd 100644 --- a/packages/poml/reader/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -24,15 +24,16 @@ export const Backslash = createToken({ name: 'Backslash', pattern: /\\/ }); */ export const Identifier = createToken({ name: 'Identifier', - pattern: /[a-zA-Z_][a-zA-Z0-9_-]*/ + pattern: /[a-zA-Z_][a-zA-Z0-9_-]*/, }); export const Whitespace = createToken({ name: 'Whitespace', pattern: /[ \t\r\n]+/, - line_breaks: true + line_breaks: true, }); +/* eslint-disable no-irregular-whitespace */ /* Catch-all for arbitrary text content - Match any char except: < — starts a tag @@ -43,8 +44,9 @@ export const Whitespace = createToken({ export const TextContent = createToken({ name: 'TextContent', pattern: /(?:[^<"'{}]|{(?!{)|}(?!}))+/, - line_breaks: true + line_breaks: true, }); +/* eslint-enable no-irregular-whitespace */ // Define token order - more specific patterns first export const allTokens = [ @@ -61,7 +63,7 @@ export const allTokens = [ Backslash, Identifier, Whitespace, - TextContent + TextContent, ]; // Extended POML Lexer class @@ -82,7 +84,7 @@ export class ExtendedPomlLexer { return { tokens: lexingResult.tokens, errors: lexingResult.errors, - groups: lexingResult.groups + groups: lexingResult.groups, }; } } diff --git a/packages/poml/reader/meta.ts b/packages/poml/next/meta.ts similarity index 100% rename from packages/poml/reader/meta.ts rename to packages/poml/next/meta.ts diff --git a/packages/poml/reader/poml.tsx b/packages/poml/next/poml.tsx similarity index 100% rename from packages/poml/reader/poml.tsx rename to packages/poml/next/poml.tsx diff --git a/packages/poml/reader/segment.ts b/packages/poml/next/segment.ts similarity index 91% rename from packages/poml/reader/segment.ts rename to packages/poml/next/segment.ts index 2218c086..440ea74a 100644 --- a/packages/poml/reader/segment.ts +++ b/packages/poml/next/segment.ts @@ -33,28 +33,33 @@ class Segmenter { private isValidPomlTag(tagName: string): boolean { const validTags = new Set(); - + for (const doc of componentDocs) { if (doc.name) { validTags.add(doc.name.toLowerCase()); - validTags.add(doc.name.toLowerCase().replace(/([A-Z])/g, '-$1').toLowerCase()); + validTags.add( + doc.name + .toLowerCase() + .replace(/([A-Z])/g, '-$1') + .toLowerCase(), + ); } } - + validTags.add('poml'); validTags.add('text'); validTags.add('meta'); - + return validTags.has(tagName.toLowerCase()); } private parseSegments(text: string, start: number = 0, parent?: Segment): Segment[] { const segments: Segment[] = []; let currentPos = start; - + while (currentPos < text.length) { const nextOpenTag = text.indexOf('<', currentPos); - + if (nextOpenTag === -1) { if (currentPos < text.length) { const textContent = text.substring(currentPos); @@ -67,13 +72,13 @@ class Segmenter { content: textContent, path: this.sourcePath, parent, - children: [] + children: [], }); } } break; } - + if (nextOpenTag > currentPos) { const textContent = text.substring(currentPos, nextOpenTag); if (textContent.trim()) { @@ -85,46 +90,46 @@ class Segmenter { content: textContent, path: this.sourcePath, parent, - children: [] + children: [], }); } } - + const tagEndPos = text.indexOf('>', nextOpenTag); if (tagEndPos === -1) { currentPos = nextOpenTag + 1; continue; } - + const tagContent = text.substring(nextOpenTag + 1, tagEndPos); const tagName = tagContent.trim().split(/\s+/)[0]; - + if (tagName.startsWith('/')) { currentPos = tagEndPos + 1; continue; } - + if (tagContent.endsWith('/')) { currentPos = tagEndPos + 1; continue; } - + if (!this.isValidPomlTag(tagName)) { currentPos = tagEndPos + 1; continue; } - + const closingTag = ``; const closingTagPos = this.findClosingTag(text, tagName, tagEndPos + 1); - + if (closingTagPos === -1) { currentPos = tagEndPos + 1; continue; } - + const segmentContent = text.substring(nextOpenTag, closingTagPos + closingTag.length); const innerContent = text.substring(tagEndPos + 1, closingTagPos); - + const segment: Segment = { id: this.generateId(), kind: tagName.toLowerCase() === 'meta' ? 'META' : 'POML', @@ -134,60 +139,60 @@ class Segmenter { path: this.sourcePath, parent, children: [], - tagName: tagName.toLowerCase() + tagName: tagName.toLowerCase(), }; - + if (tagName.toLowerCase() === 'text') { segment.children = this.parseSegments(innerContent, tagEndPos + 1, segment); } else if (tagName.toLowerCase() !== 'meta') { const childSegments = this.parseSegments(innerContent, tagEndPos + 1, segment); segment.children = childSegments; } - + segments.push(segment); currentPos = closingTagPos + closingTag.length; } - + return segments; } private findClosingTag(text: string, tagName: string, startPos: number): number { let depth = 1; let pos = startPos; - + while (pos < text.length && depth > 0) { const nextTag = text.indexOf('<', pos); if (nextTag === -1) { break; } - + const tagEndPos = text.indexOf('>', nextTag); if (tagEndPos === -1) { break; } - + const tagContent = text.substring(nextTag + 1, tagEndPos); const currentTagName = tagContent.trim().split(/\s+/)[0]; - + if (currentTagName === tagName) { depth++; } else if (currentTagName === `/${tagName}`) { depth--; } - + pos = tagEndPos + 1; } - - return depth === 0 ? pos - (``.length) : -1; + + return depth === 0 ? pos - ``.length : -1; } public createSegments(content: string): Segment { const rootSegments = this.parseSegments(content); - + if (rootSegments.length === 1 && rootSegments[0].kind === 'POML') { return rootSegments[0]; } - + if (rootSegments.length === 0) { return { id: this.generateId(), @@ -197,10 +202,10 @@ class Segmenter { content: content, path: this.sourcePath, children: [], - parent: undefined + parent: undefined, }; } - + const rootSegment: Segment = { id: this.generateId(), kind: 'TEXT', @@ -209,13 +214,13 @@ class Segmenter { content: content, path: this.sourcePath, children: rootSegments, - parent: undefined + parent: undefined, }; - - rootSegments.forEach(segment => { + + rootSegments.forEach((segment) => { segment.parent = rootSegment; }); - + return rootSegment; } } diff --git a/packages/poml/reader/text.tsx b/packages/poml/next/text.tsx similarity index 100% rename from packages/poml/reader/text.tsx rename to packages/poml/next/text.tsx diff --git a/packages/poml/reader/tokenizer.ts b/packages/poml/next/tokenizer.ts similarity index 82% rename from packages/poml/reader/tokenizer.ts rename to packages/poml/next/tokenizer.ts index a8e166d1..ce1930b8 100644 --- a/packages/poml/reader/tokenizer.ts +++ b/packages/poml/next/tokenizer.ts @@ -16,14 +16,14 @@ export class Tokenizer { tokenize(): Token[] { const tokens: Token[] = []; - + while (this.position < this.input.length) { // Check for template variables first if (this.peek() === '{' && this.peek(1) === '{') { tokens.push(this.readTemplateVariable()); continue; } - + // Check for XML tags if (this.peek() === '<') { const tagToken = this.readTag(); @@ -32,14 +32,14 @@ export class Tokenizer { continue; } } - + // Read text content const textToken = this.readText(); if (textToken.value.length > 0) { tokens.push(textToken); } } - + return tokens; } @@ -55,69 +55,71 @@ export class Tokenizer { const start = this.position; this.advance(); // { this.advance(); // { - + while (this.position < this.input.length && !(this.peek() === '}' && this.peek(1) === '}')) { this.advance(); } - + if (this.peek() === '}' && this.peek(1) === '}') { this.advance(); // } this.advance(); // } } - + return { type: 'TEMPLATE_VAR', value: this.input.substring(start, this.position), start, - end: this.position + end: this.position, }; } private readTag(): Token | null { const start = this.position; this.advance(); // < - + // Skip whitespace while (this.peek() === ' ' || this.peek() === '\t' || this.peek() === '\n') { this.advance(); } - + // Check for closing tag const isClosing = this.peek() === '/'; if (isClosing) { this.advance(); } - + // Read tag name let tagName = ''; - while (this.position < this.input.length && - this.peek() !== '>' && - this.peek() !== ' ' && - this.peek() !== '\t' && - this.peek() !== '\n') { + while ( + this.position < this.input.length && + this.peek() !== '>' && + this.peek() !== ' ' && + this.peek() !== '\t' && + this.peek() !== '\n' + ) { tagName += this.advance(); } - + // Skip attributes for now (will be parsed separately) while (this.position < this.input.length && this.peek() !== '>') { this.advance(); } - + if (this.peek() === '>') { this.advance(); // > - + // Check if self-closing const content = this.input.substring(start, this.position); const isSelfClosing = content.endsWith('/>'); - + return { - type: isSelfClosing ? 'TAG_SELF_CLOSE' : (isClosing ? 'TAG_CLOSE' : 'TAG_OPEN'), + type: isSelfClosing ? 'TAG_SELF_CLOSE' : isClosing ? 'TAG_CLOSE' : 'TAG_OPEN', value: content, start, - end: this.position + end: this.position, }; } - + // Invalid tag, backtrack this.position = start + 1; return null; @@ -125,18 +127,16 @@ export class Tokenizer { private readText(): Token { const start = this.position; - - while (this.position < this.input.length && - this.peek() !== '<' && - !(this.peek() === '{' && this.peek(1) === '{')) { + + while (this.position < this.input.length && this.peek() !== '<' && !(this.peek() === '{' && this.peek(1) === '{')) { this.advance(); } - + return { type: 'TEXT', value: this.input.substring(start, this.position), start, - end: this.position + end: this.position, }; } } diff --git a/packages/poml/tests/reader/ast.test.ts b/packages/poml/tests/reader/ast.test.ts index 9921e210..4b5819db 100644 --- a/packages/poml/tests/reader/ast.test.ts +++ b/packages/poml/tests/reader/ast.test.ts @@ -1,11 +1,11 @@ import { describe, expect, test } from '@jest/globals'; -import { parseAST, ASTNode } from 'poml/reader/ast'; +import { parseAST, ASTNode } from 'poml/next/ast'; describe('parseAST', () => { test('pure text content', () => { const content = 'This is pure text content with no POML tags.'; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.content).toBe(content); expect(ast.start).toBe(0); @@ -16,7 +16,7 @@ describe('parseAST', () => { test('single POML tag', () => { const content = 'Analyze the data'; const ast = parseAST(content); - + expect(ast.kind).toBe('POML'); expect(ast.tagName).toBe('task'); expect(ast.content).toBe(content); @@ -40,23 +40,23 @@ Here are some key points to consider: - Business impact`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(4); - + const children = ast.children; expect(children[0].kind).toBe('TEXT'); expect(children[0].content).toContain('# My Analysis Document'); - + expect(children[1].kind).toBe('POML'); expect(children[1].tagName).toBe('task'); expect(children[1].content).toBe(` Analyze the following data and provide insights. `); - + expect(children[2].kind).toBe('TEXT'); expect(children[2].content).toContain('Here are some key points'); - + expect(children[3].kind).toBe('TEXT'); expect(children[3].content).toContain('- Data quality'); }); @@ -70,7 +70,7 @@ Here are some key points to consider: `; const ast = parseAST(content); - + expect(ast.kind).toBe('POML'); expect(ast.tagName).toBe('examples'); expect(ast.children).toHaveLength(0); @@ -88,7 +88,7 @@ Here are some key points to consider: test('text in text in POML', () => { const content = `This is a text with nested text content.`; const ast = parseAST(content); - expect(ast.kind).toBe('POML'); + expect(ast.kind).toBe('POML'); expect(ast.tagName).toBe('poml'); expect(ast.children).toHaveLength(1); const textNode = ast.children[0]; @@ -122,18 +122,20 @@ Here are some key points to consider: `; const ast = parseAST(content); - + expect(ast.kind).toBe('POML'); expect(ast.tagName).toBe('poml'); expect(ast.children).toHaveLength(4); - - const textNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'text'); + + const textNode = ast.children.find((c) => c.kind === 'POML' && c.tagName === 'text'); expect(textNode).toBeDefined(); expect(textNode!.children).toHaveLength(3); - - const nestedCpNode = textNode!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); + + const nestedCpNode = textNode!.children.find((c) => c.kind === 'POML' && c.tagName === 'cp'); expect(nestedCpNode).toBeDefined(); - expect(nestedCpNode!.content).toBe('This is a nested POML component that will be processed as POML.'); + expect(nestedCpNode!.content).toBe( + 'This is a nested POML component that will be processed as POML.', + ); }); test('meta tags', () => { @@ -146,16 +148,16 @@ Here are some key points to consider: Complete the analysis`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(3); - - const metaNode = ast.children.find(c => c.kind === 'META'); + + const metaNode = ast.children.find((c) => c.kind === 'META'); expect(metaNode).toBeDefined(); expect(metaNode!.tagName).toBe('meta'); expect(metaNode!.children).toHaveLength(0); - - const taskNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'task'); + + const taskNode = ast.children.find((c) => c.kind === 'POML' && c.tagName === 'task'); expect(taskNode).toBeDefined(); }); @@ -165,15 +167,15 @@ Here are some key points to consider: This should also be ignored`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(3); - - const taskNode = ast.children.find(c => c.kind === 'POML'); + + const taskNode = ast.children.find((c) => c.kind === 'POML'); expect(taskNode).toBeDefined(); expect(taskNode!.tagName).toBe('task'); - - const textNodes = ast.children.filter(c => c.kind === 'TEXT'); + + const textNodes = ast.children.filter((c) => c.kind === 'TEXT'); expect(textNodes).toHaveLength(2); expect(textNodes[0].content).toContain('This should be ignored'); expect(textNodes[1].content).toContain('This should also be ignored'); @@ -186,11 +188,11 @@ Here are some key points to consider: Valid hint`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(4); - - const pomlNodes = ast.children.filter(c => c.kind === 'POML'); + + const pomlNodes = ast.children.filter((c) => c.kind === 'POML'); expect(pomlNodes).toHaveLength(3); expect(pomlNodes[0].tagName).toBe('task'); expect(pomlNodes[2].tagName).toBe('hint'); @@ -202,15 +204,15 @@ Here are some key points to consider: This has no closing tag`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(3); - - const hintNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); + + const hintNode = ast.children.find((c) => c.kind === 'POML' && c.tagName === 'hint'); expect(hintNode).toBeDefined(); expect(hintNode!.content).toBe('Complete hint'); - - const textNodes = ast.children.filter(c => c.kind === 'TEXT'); + + const textNodes = ast.children.filter((c) => c.kind === 'TEXT'); expect(textNodes).toHaveLength(2); expect(textNodes[0].content).toBe('Incomplete tag\n'); expect(textNodes[1].content).toBe('\nThis has no closing tag'); @@ -219,7 +221,7 @@ Here are some key points to consider: test('malformed POML tags are ignored', () => { const content = `Valid task`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(0); }); @@ -227,7 +229,7 @@ Here are some key points to consider: test('empty content', () => { const content = ''; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.content).toBe(''); expect(ast.children).toHaveLength(0); @@ -236,7 +238,7 @@ Here are some key points to consider: test('whitespace-only content', () => { const content = ' \n\n\t \n '; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.content).toBe(content); expect(ast.children).toHaveLength(0); @@ -248,11 +250,11 @@ Here are some key points to consider: User message`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(4); - - const pomlNodes = ast.children.filter(c => c.kind === 'POML'); + + const pomlNodes = ast.children.filter((c) => c.kind === 'POML'); expect(pomlNodes).toHaveLength(3); expect(pomlNodes[0].tagName).toBe('output-format'); expect(pomlNodes[1].tagName).toBe('system-msg'); @@ -269,21 +271,21 @@ Here are some key points to consider: `; const ast = parseAST(content); - + const taskNode = ast; expect(taskNode.kind).toBe('POML'); expect(taskNode.tagName).toBe('task'); expect(taskNode.parent).toBeUndefined(); - - const hintNode = taskNode.children.find(c => c.kind === 'POML' && c.tagName === 'hint'); + + const hintNode = taskNode.children.find((c) => c.kind === 'POML' && c.tagName === 'hint'); expect(hintNode).toBeDefined(); expect(hintNode!.parent).toBe(taskNode); - - const examplesNode = taskNode.children.find(c => c.kind === 'POML' && c.tagName === 'examples'); + + const examplesNode = taskNode.children.find((c) => c.kind === 'POML' && c.tagName === 'examples'); expect(examplesNode).toBeDefined(); expect(examplesNode!.parent).toBe(taskNode); - - const exampleNode = examplesNode!.children.find(c => c.kind === 'POML' && c.tagName === 'example'); + + const exampleNode = examplesNode!.children.find((c) => c.kind === 'POML' && c.tagName === 'example'); expect(exampleNode).toBeDefined(); expect(exampleNode!.parent).toBe(examplesNode); }); @@ -296,19 +298,19 @@ Here are some key points to consider: const ast = parseAST(content); expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(5); - + function collectAllNodes(node: ASTNode): ASTNode[] { const all = [node]; - node.children.forEach(child => { + node.children.forEach((child) => { all.push(...collectAllNodes(child)); }); return all; } - + const allNodes = collectAllNodes(ast); - const ids = allNodes.map(s => s.id); + const ids = allNodes.map((s) => s.id); const uniqueIds = new Set(ids); - + expect(uniqueIds.size).toBe(ids.length); }); @@ -338,41 +340,41 @@ There can be some intervening text here as well.

    POML elements do not necessarily reside in a poml element.

    `; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.children).toHaveLength(5); - - const firstPomlNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'poml'); + + const firstPomlNode = ast.children.find((c) => c.kind === 'POML' && c.tagName === 'poml'); expect(firstPomlNode).toBeDefined(); expect(firstPomlNode!.children).toHaveLength(4); - - const textNode = firstPomlNode!.children.find(c => c.kind === 'POML' && c.tagName === 'text'); + + const textNode = firstPomlNode!.children.find((c) => c.kind === 'POML' && c.tagName === 'text'); expect(textNode).toBeDefined(); expect(textNode!.children).toHaveLength(3); - - const cpNode = textNode!.children.find(c => c.kind === 'POML' && c.tagName === 'cp'); + + const cpNode = textNode!.children.find((c) => c.kind === 'POML' && c.tagName === 'cp'); expect(cpNode).toBeDefined(); - - const secondPomlNode = ast.children.filter(c => c.kind === 'POML' && c.tagName === 'poml')[1]; + + const secondPomlNode = ast.children.filter((c) => c.kind === 'POML' && c.tagName === 'poml')[1]; expect(secondPomlNode).toBeDefined(); const lineBreakNode = ast.children[3]; expect(lineBreakNode.kind).toBe('TEXT'); expect(lineBreakNode.content).toBe('\n\n'); - const pNode = ast.children.find(c => c.kind === 'POML' && c.tagName === 'p'); + const pNode = ast.children.find((c) => c.kind === 'POML' && c.tagName === 'p'); expect(pNode).toBeDefined(); }); test('template variables in content', () => { const content = `Process {{variable}} with {{another_variable}}`; const ast = parseAST(content); - + expect(ast.kind).toBe('POML'); expect(ast.tagName).toBe('task'); expect(ast.children).toHaveLength(4); // text, template, text, template - - const templateNodes = ast.children.filter(c => c.kind === 'TEMPLATE'); + + const templateNodes = ast.children.filter((c) => c.kind === 'TEMPLATE'); expect(templateNodes).toHaveLength(2); expect(templateNodes[0].expression).toBe('variable'); expect(templateNodes[1].expression).toBe('another_variable'); @@ -381,7 +383,7 @@ There can be some intervening text here as well. test('template variables in text nodes are treated as literal', () => { const content = `Variables like {{this}} are shown as-is`; const ast = parseAST(content); - + expect(ast.kind).toBe('TEXT'); expect(ast.content).toBe(content); expect(ast.children).toHaveLength(0); @@ -390,11 +392,11 @@ There can be some intervening text here as well. test('template variables in attribute values', () => { const content = `Content`; const ast = parseAST(content); - + expect(ast.kind).toBe('POML'); expect(ast.tagName).toBe('task'); expect(ast.attributes).toHaveLength(1); - + const attr = ast.attributes![0]; expect(attr.key).toBe('caption'); expect(attr.value).toHaveLength(2); // text + template @@ -407,10 +409,10 @@ There can be some intervening text here as well. test('mixed template variables and text in attributes', () => { const content = `Content`; const ast = parseAST(content); - + expect(ast.kind).toBe('POML'); expect(ast.attributes).toHaveLength(1); - + const attr = ast.attributes![0]; expect(attr.value).toHaveLength(4); // text, template, text, template expect(attr.value[0].content).toBe('Hello '); @@ -418,4 +420,4 @@ There can be some intervening text here as well. expect(attr.value[2].content).toBe(', process '); expect(attr.value[3].expression).toBe('data'); }); -}); \ No newline at end of file +}); diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts index 1e9e5ad6..4406f53f 100644 --- a/packages/poml/tests/reader/cst.test.ts +++ b/packages/poml/tests/reader/cst.test.ts @@ -1,11 +1,11 @@ import { describe, expect, test } from '@jest/globals'; -import { parseExtendedPoml, ASTNode } from 'poml/reader/cst'; +import { parseExtendedPoml, ASTNode } from 'poml/next/cst'; describe('Extended POML CST Parser', () => { test('parses pure text content', () => { const input = 'This is plain text content.'; const result = parseExtendedPoml(input); - + expect(result.kind).toBe('TEXT'); expect(result.content).toBe(input); expect(result.children).toHaveLength(0); @@ -14,10 +14,10 @@ describe('Extended POML CST Parser', () => { test('parses simple POML element', () => { const input = 'Analyze the data'; const result = parseExtendedPoml(input); - + expect(result.kind).toBe('TEXT'); expect(result.children).toHaveLength(1); - + const taskNode = result.children[0]; expect(taskNode.kind).toBe('POML'); expect(taskNode.tagName).toBe('task'); @@ -35,14 +35,14 @@ This is regular text.
    More text here.`; - + const result = parseExtendedPoml(input); - + expect(result.kind).toBe('TEXT'); expect(result.children.length).toBeGreaterThan(1); - + // Should have text nodes and POML nodes - const pomlNodes = result.children.filter(child => child.kind === 'POML'); + const pomlNodes = result.children.filter((child) => child.kind === 'POML'); expect(pomlNodes).toHaveLength(1); expect(pomlNodes[0].tagName).toBe('task'); }); @@ -50,7 +50,7 @@ More text here.`; test('parses self-closing elements', () => { const input = ''; const result = parseExtendedPoml(input); - + expect(result.children).toHaveLength(1); const metaNode = result.children[0]; expect(metaNode.kind).toBe('META'); @@ -62,9 +62,9 @@ More text here.`; test('parses template expressions', () => { const input = 'Hello {{name}}!'; const result = parseExtendedPoml(input); - + expect(result.children.length).toBeGreaterThan(1); - const templateNode = result.children.find(child => child.kind === 'TEMPLATE'); + const templateNode = result.children.find((child) => child.kind === 'TEMPLATE'); expect(templateNode).toBeDefined(); expect(templateNode!.expression).toBe('name'); }); @@ -72,16 +72,16 @@ More text here.`; test('parses attributes with mixed content', () => { const input = '

    Content

    '; const result = parseExtendedPoml(input); - - const pNode = result.children.find(child => child.kind === 'POML'); + + const pNode = result.children.find((child) => child.kind === 'POML'); expect(pNode).toBeDefined(); expect(pNode!.attributes).toHaveLength(2); - - const classAttr = pNode!.attributes!.find(attr => attr.key === 'class'); + + const classAttr = pNode!.attributes!.find((attr) => attr.key === 'class'); expect(classAttr).toBeDefined(); expect(classAttr!.value[0].content).toBe('header'); - - const idAttr = pNode!.attributes!.find(attr => attr.key === 'id'); + + const idAttr = pNode!.attributes!.find((attr) => attr.key === 'id'); expect(idAttr).toBeDefined(); expect(idAttr!.value[0].kind).toBe('TEMPLATE'); }); @@ -92,21 +92,21 @@ This is **markdown** content. This is nested POML More markdown here.
    `; - + const result = parseExtendedPoml(input); - const textNode = result.children.find(child => child.kind === 'POML' && child.tagName === 'text'); - + const textNode = result.children.find((child) => child.kind === 'POML' && child.tagName === 'text'); + expect(textNode).toBeDefined(); expect(textNode!.children.length).toBeGreaterThan(1); - - const cpNode = textNode!.children.find(child => child.kind === 'POML' && child.tagName === 'cp'); + + const cpNode = textNode!.children.find((child) => child.kind === 'POML' && child.tagName === 'cp'); expect(cpNode).toBeDefined(); }); test('preserves source position information', () => { const input = 'Test'; const result = parseExtendedPoml(input); - + const taskNode = result.children[0]; expect(taskNode.start).toBe(0); expect(taskNode.end).toBe(input.length); @@ -118,12 +118,12 @@ More markdown here. test('handles unknown components gracefully', () => { const input = 'This should be treated as text'; - + // Should not throw by default (warning behavior) const result = parseExtendedPoml(input); expect(result).toBeDefined(); - + // Should treat unknown tag as text content expect(result.children.length).toBeGreaterThan(0); }); -}); \ No newline at end of file +}); diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index bbd8f976..432c06ee 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -14,19 +14,19 @@ import { Backslash, Identifier, Whitespace, - TextContent -} from 'poml/reader/lexer'; + TextContent, +} from 'poml/next/lexer'; // Helper function to extract token images function tokenImages(input: string): string[] { const result = extendedPomlLexer.tokenize(input); - return result.tokens.map(t => t.image); + return result.tokens.map((t) => t.image); } // Helper function to extract token types function tokenTypes(input: string): any[] { const result = extendedPomlLexer.tokenize(input); - return result.tokens.map(t => t.tokenType); + return result.tokens.map((t) => t.tokenType); } // Helper function to get full tokenization result @@ -85,7 +85,7 @@ describe('Edge Cases', () => { 'poml', '>', 'ghi', - '"' + '"', ]); }); @@ -103,7 +103,7 @@ describe('Edge Cases', () => { 'ghi', '' + '>', ]); }); @@ -121,7 +121,7 @@ describe('Edge Cases', () => { '内容', '<', ' ', - '标签>' + '标签>', ]); }); @@ -143,7 +143,7 @@ describe('Edge Cases', () => { '"', 'test', '"', - '>' + '>', ]); }); @@ -161,7 +161,7 @@ describe('Edge Cases', () => { '"', ' ', 'quotes', - '"' + '"', ]); }); @@ -191,16 +191,16 @@ describe('Edge Cases', () => {
  • {{name}}
  • -{{/each}}` +{{/each}}`, ]; - realWorldTests.forEach(test => { + realWorldTests.forEach((test) => { const result = tokenize(test); expect(result.errors).toHaveLength(0); expect(result.tokens.length).toBeGreaterThan(0); // Verify position integrity - result.tokens.forEach(token => { + result.tokens.forEach((token) => { expect(token.startOffset).toBeGreaterThanOrEqual(0); expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); }); @@ -216,14 +216,14 @@ describe('Edge Cases', () => { 'first=one second=two', '=standalone', 'text=content', - 'a=b=c' + 'a=b=c', ]; - equalsTests.forEach(test => { + equalsTests.forEach((test) => { const result = tokenize(test); expect(result.errors).toHaveLength(0); - const equalsTokens = result.tokens.filter(t => t.tokenType.name === 'Equals'); + const equalsTokens = result.tokens.filter((t) => t.tokenType.name === 'Equals'); expect(equalsTokens.length).toBeGreaterThan(0); }); }); @@ -231,7 +231,7 @@ describe('Edge Cases', () => { test('should handle edge cases with zero-length matches', () => { const edgeCases = ['', ' ', '\n', '\t', '\r', '{{}}', '', '<>', '""', "''", '\\']; - edgeCases.forEach(test => { + edgeCases.forEach((test) => { const result = tokenize(test); expect(result.errors).toHaveLength(0); @@ -295,7 +295,7 @@ line2 line3`; const result = tokenize(input); - const tagToken = result.tokens.find(t => t.tokenType === TagOpen); + const tagToken = result.tokens.find((t) => t.tokenType === TagOpen); expect(tagToken).toBeDefined(); expect(tagToken!.startLine).toBe(2); expect(tagToken!.startColumn).toBe(7); // After "line2 " @@ -368,7 +368,7 @@ Analyze data 'content', '' + '>', ]); }); @@ -385,7 +385,7 @@ Analyze data '}}', '/file.txt', '"', - '>' + '>', ]); }); }); @@ -491,18 +491,7 @@ describe('Unicode and Special Characters', () => { test('should handle unicode', () => { expect(tokenImages('<こんにちは>')).toEqual(['<', 'こんにちは>']); expect(tokenImages('{{你好}}')).toEqual(['{{', '你好', '}}']); - expect(tokenImages('')).toEqual([ - '<', - 'tag', - ' ', - 'attr', - '=', - '"', - 'caf', - 'é', - '"', - '>' - ]); + expect(tokenImages('')).toEqual(['<', 'tag', ' ', 'attr', '=', '"', 'caf', 'é', '"', '>']); }); test('should maintain lexer stability with all edge cases', () => { @@ -514,7 +503,7 @@ describe('Unicode and Special Characters', () => { expect(result.tokens.length).toBeGreaterThan(0); // Verify token integrity - result.tokens.forEach(token => { + result.tokens.forEach((token) => { expect(token.startOffset).toBeGreaterThanOrEqual(0); if (token.endOffset !== undefined) { expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset); @@ -566,31 +555,14 @@ describe('Malformed Patterns', () => { test('should handle nested malformed patterns', () => { expect(tokenImages('')).toEqual(['']); expect(tokenImages('')).toEqual(['']); - expect(tokenImages('more{{ content')).toEqual([ '', @@ -649,7 +621,7 @@ describe('Malformed Patterns', () => { 'more', '{{', ' ', - 'content' + 'content', ]); expect(tokenImages("\"quoted textend")).toEqual([ '"', @@ -665,7 +637,7 @@ describe('Malformed Patterns', () => { 'mixed', "'", '>', - 'end' + 'end', ]); }); @@ -693,11 +665,11 @@ describe('Position Tracking Accuracy', () => { final line`; const result = tokenize(input); - const tagOpenToken = result.tokens.find(t => t.image === '<' && t.startLine === 2); + const tagOpenToken = result.tokens.find((t) => t.image === '<' && t.startLine === 2); expect(tagOpenToken).toBeDefined(); expect(tagOpenToken!.startColumn).toBe(1); - const variableToken = result.tokens.find(t => t.image === 'variable'); + const variableToken = result.tokens.find((t) => t.image === 'variable'); expect(variableToken).toBeDefined(); expect(variableToken!.startLine).toBe(3); }); @@ -707,7 +679,7 @@ final line`; const result = tokenize(input); expect(result.tokens.length).toBeGreaterThan(0); - result.tokens.forEach(token => { + result.tokens.forEach((token) => { expect(token.startOffset).toBeGreaterThanOrEqual(0); expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); expect(token.startLine).toBeGreaterThanOrEqual(1); @@ -720,7 +692,7 @@ final line`; const result = tokenize(input); // Verify all tokens have valid positions - result.tokens.forEach(token => { + result.tokens.forEach((token) => { expect(token.startOffset).toBeGreaterThanOrEqual(0); expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); expect(token.startLine).toBeGreaterThanOrEqual(1); @@ -733,8 +705,8 @@ final line`; const result = tokenize(input); // Find tokens and verify their positions make sense - const tagOpen = result.tokens.find(t => t.image === '<' && t.startLine === 1); - const innerOpen = result.tokens.find(t => t.image === '<' && t.startLine === 2); + const tagOpen = result.tokens.find((t) => t.image === '<' && t.startLine === 1); + const innerOpen = result.tokens.find((t) => t.image === '<' && t.startLine === 2); expect(tagOpen).toBeDefined(); expect(innerOpen).toBeDefined(); @@ -751,7 +723,7 @@ final line`; // Verify complete coverage let expectedOffset = 0; - sortedTokens.forEach(token => { + sortedTokens.forEach((token) => { expect(token.startOffset).toBeGreaterThanOrEqual(expectedOffset); expectedOffset = token.endOffset! + 1; }); @@ -768,7 +740,7 @@ comment --> more text`; const result = tokenize(input); - const commentToken = result.tokens.find(t => t.tokenType.name === 'Comment'); + const commentToken = result.tokens.find((t) => t.tokenType.name === 'Comment'); expect(commentToken).toBeDefined(); expect(commentToken!.startLine).toBe(2); @@ -780,11 +752,11 @@ more text`; const result = tokenize(input); // Check that line numbers increase correctly - const lines = new Set(result.tokens.map(t => t.startLine)); + const lines = new Set(result.tokens.map((t) => t.startLine)); expect(lines.size).toBeGreaterThan(1); // Verify positions are sequential - result.tokens.forEach(token => { + result.tokens.forEach((token) => { expect(token.startOffset).toBeGreaterThanOrEqual(0); expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!); }); @@ -862,10 +834,10 @@ describe('Performance and Stress Tests', () => { '<'.repeat(1000) + '>', '"'.repeat(2000), '', - Array(1000).fill('{{}}').join('') + Array(1000).fill('{{}}').join(''), ]; - backtrackingTests.forEach(test => { + backtrackingTests.forEach((test) => { const start = performance.now(); const result = tokenize(test); const end = performance.now(); @@ -879,7 +851,7 @@ describe('Performance and Stress Tests', () => { const sizes = [1000, 5000, 10000, 20000]; const times: number[] = []; - sizes.forEach(size => { + sizes.forEach((size) => { const content = 'x'.repeat(size); const start = performance.now(); tokenize(content); @@ -914,7 +886,7 @@ describe('Error Recovery', () => { expect(result.errors).toHaveLength(0); expect(result.tokens.length).toBeGreaterThan(0); - const types = result.tokens.map(t => t.tokenType); + const types = result.tokens.map((t) => t.tokenType); expect(types).toContain(Identifier); expect(types).toContain(TemplateOpen); }); @@ -930,7 +902,7 @@ describe('Error Recovery', () => { expect(result.tokens.length).toBeGreaterThan(0); // Should tokenize the valid parts - const images = result.tokens.map(t => t.image); + const images = result.tokens.map((t) => t.image); expect(images).toContain('<'); expect(images).toContain('valid'); expect(images).toContain('>'); @@ -941,7 +913,7 @@ describe('Error Recovery', () => { const input = 'text with @#$%^&*()[]{}|;:,.<>?/~`'; const result = tokenize(input); expect(result.errors).toHaveLength(0); - const images = result.tokens.map(t => t.image); + const images = result.tokens.map((t) => t.image); expect(images).toEqual(['text', ' ', 'with', ' ', '@#$%^&*()[]{}|;:,.', '<', '>', '?/~`']); }); }); From d8b272ebef2f8eab8174e075aa12d367f7716fa9 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 26 Aug 2025 15:44:19 +0800 Subject: [PATCH 20/76] error and source --- packages/poml/next/context.ts | 28 ++- packages/poml/next/error.ts | 422 ++++++++++++++++++++++++++++++++++ packages/poml/next/source.ts | 115 +++++++++ packages/poml/next/types.ts | 48 ++++ 4 files changed, 607 insertions(+), 6 deletions(-) create mode 100644 packages/poml/next/error.ts create mode 100644 packages/poml/next/source.ts create mode 100644 packages/poml/next/types.ts diff --git a/packages/poml/next/context.ts b/packages/poml/next/context.ts index 92939bda..bee95045 100644 --- a/packages/poml/next/context.ts +++ b/packages/poml/next/context.ts @@ -1,17 +1,33 @@ -export class ContextEvaluator { - private contextStore: { [key: string]: any } = {}; - private stack: Array<{ [key: string]: any }> = []; +/** One manager per POML compile (nested files do not count) */ +export type Context = { [key: string]: T }; +export class ContextManager { + private contextStore: { [key: string]: T } = {}; + private stack: Array<{ [key: string]: T }> = []; - public setGlobalVariable(key: string, value: any) { + public initialize(initialContext: { [key: string]: T }) { + this.contextStore = { ...initialContext }; + this.stack = []; + } + + public setGlobalVariable(key: string, value: T) { this.contextStore[key] = value; } - public setLocalVariable(key: string, value: any) { + public setLocalVariable(key: string, value: T) { if (this.stack.length === 0) { throw new Error('No local stack available'); } this.stack[this.stack.length - 1][key] = value; } - public pushStack() {} + public pushStack(context: Context) { + this.stack.push({ ...context }); + } + + public popStack() { + if (this.stack.length === 0) { + throw new Error('No local stack to pop'); + } + this.stack.pop(); + } } diff --git a/packages/poml/next/error.ts b/packages/poml/next/error.ts new file mode 100644 index 00000000..83928da6 --- /dev/null +++ b/packages/poml/next/error.ts @@ -0,0 +1,422 @@ +import * as path from 'path'; +import chalk from 'chalk'; +import { Diagnostic, Range, Severity } from './types'; +import sourceManager from './source'; + +/** + * Global Error Collector. + * + * Goals: + * + * 1. Centralized singleton that collects errors from anywhere in the codebase + * 2. Support for error types (error/warning), source locations (file, line, column, index ranges), and contextual data + * 3. Handle errors from embedded languages (JSON, JS expressions) with source mapping back to original positions + * 4. Track errors across multiple source files without conflicts + * 5. Collect multiple errors without stopping execution + * 6. Clear errors between compilation runs or test cases + * 7. Generate human-readable, formatted error messages with source context + */ +export class ErrorCollector { + private diagnostics: Diagnostic[] = []; + private suppressedCodes = new Set(); + private maxErrors = 100; + + /** + * Clear all collected errors + */ + public clear(): void { + this.diagnostics = []; + } + + /** + * Post an error + */ + public error(message: string, range?: Range, options: Partial = {}): void { + this.add({ + ...options, + severity: Severity.ERROR, + message, + range, + sourceFile: options.sourceFile || sourceManager.getCurrentFile(), + }); + } + + /** + * Post a warning + */ + public warning(message: string, range?: Range, options: Partial = {}): void { + this.add({ + ...options, + severity: Severity.WARNING, + message, + range, + sourceFile: options.sourceFile || sourceManager.getCurrentFile(), + }); + } + + /** + * Post an info message + */ + public info(message: string, range?: Range, options: Partial = {}): void { + this.add({ + ...options, + severity: Severity.INFO, + message, + range, + sourceFile: options.sourceFile || sourceManager.getCurrentFile(), + }); + } + + /** + * Add a diagnostic + */ + public add(diagnostic: Diagnostic): void { + // Check error limit + if (this.diagnostics.length >= this.maxErrors) { + if (this.diagnostics.length === this.maxErrors) { + this.diagnostics.push({ + severity: Severity.ERROR, + message: `Error limit reached (${this.maxErrors}). Further errors suppressed.`, + }); + } + return; + } + + // Skip suppressed error codes + if (diagnostic.code && this.suppressedCodes.has(diagnostic.code)) { + return; + } + + // Add current file if not specified + if (!diagnostic.sourceFile && sourceManager.getCurrentFile()) { + diagnostic.sourceFile = sourceManager.getCurrentFile(); + } + + this.diagnostics.push(diagnostic); + } + + /** + * Post a JSON parsing error with automatic position mapping + */ + public jsonError(originalError: Error, jsonRange: Range): void { + // Extract position from JSON parse error if available + const posMatch = originalError.message.match(/position (\d+)/); + let range = jsonRange; + + if (posMatch) { + const errorPos = parseInt(posMatch[1]); + // Map the JSON error position to the original source + range = { + start: jsonRange.start + errorPos, + end: jsonRange.start + errorPos + 1, + }; + } + + this.error(`JSON parsing error: ${originalError.message}`, range, { + code: 'JSON_PARSE_ERROR', + originalError, + hint: 'Check for trailing commas, unquoted keys, or undefined values', + }); + } + + /** + * Post a JavaScript expression evaluation error + */ + public expressionError(originalError: Error, expressionRange: Range, evalHeaderLength: number = 0): void { + // Adjust range if there's a header (like "return " or "const result = ") + const adjustedRange = + evalHeaderLength > 0 + ? { + start: expressionRange.start + evalHeaderLength, + end: expressionRange.end, + } + : expressionRange; + + // Try to extract line/column from error stack + const stackMatch = originalError.stack?.match(/:(\d+):(\d+)/); + let range = adjustedRange; + + if (stackMatch) { + const errorLine = parseInt(stackMatch[1]); + const errorCol = parseInt(stackMatch[2]); + + // If we have line/column info, try to be more precise + const currentFileContent = sourceManager.getCurrentFileContent(); + if (currentFileContent) { + const exprContent = currentFileContent.substring(expressionRange.start, expressionRange.end); + const lines = exprContent.split('\n'); + + if (errorLine <= lines.length) { + let offset = expressionRange.start; + for (let i = 0; i < errorLine - 1; i++) { + offset += lines[i].length + 1; // +1 for newline + } + offset += Math.min(errorCol - 1, lines[errorLine - 1].length); + + range = { + start: offset, + end: offset + 1, + }; + } + } + } + + this.error(`Expression evaluation failed: ${originalError.message}`, range, { + code: 'EXPRESSION_ERROR', + originalError, + hint: 'Check variable names and syntax in the expression', + }); + } + + /** + * Suppress errors with specific codes + */ + public suppressCode(code: string): void { + this.suppressedCodes.add(code); + } + + /** + * Format a single diagnostic for CLI output + */ + private formatDiagnostic(diagnostic: Diagnostic): string { + const parts: string[] = []; + + // Severity and code + const severityColor = { + [Severity.ERROR]: chalk.red, + [Severity.WARNING]: chalk.yellow, + [Severity.INFO]: chalk.blue, + }[diagnostic.severity]; + + let header = severityColor(diagnostic.severity.toUpperCase()); + + if (diagnostic.code) { + header += chalk.gray(` [${diagnostic.code}]`); + } + + // File location + if (diagnostic.sourceFile) { + const source = sourceManager.loadSource(diagnostic.sourceFile); + + if (source && diagnostic.range) { + const startPos = sourceManager.indexToPosition(source, diagnostic.range.start); + const location = `${diagnostic.sourceFile}:${startPos.line}:${startPos.column}`; + header += ` ${chalk.cyan(location)}`; + } else { + header += ` ${chalk.cyan(diagnostic.sourceFile)}`; + } + } + + parts.push(header); + + // Message + parts.push(` ${diagnostic.message}`); + + // Source context + if (diagnostic.sourceFile && diagnostic.range) { + const source = sourceManager.loadSource(diagnostic.sourceFile); + + if (source) { + const startPos = sourceManager.indexToPosition(source, diagnostic.range.start); + const endPos = sourceManager.indexToPosition(source, diagnostic.range.end); + + // Show context lines + const contextLines = 2; + const startLine = Math.max(0, startPos.line - contextLines - 1); + const endLine = Math.min(source.lines.length - 1, startPos.line + contextLines - 1); + + parts.push(''); + + for (let i = startLine; i <= endLine; i++) { + const lineNum = String(i + 1).padStart(4, ' '); + const isErrorLine = i === startPos.line - 1; + const pipe = isErrorLine ? '>' : '|'; + const lineColor = isErrorLine ? chalk.white : chalk.gray; + + parts.push(chalk.gray(` ${lineNum} ${pipe}`) + ' ' + lineColor(source.lines[i])); + + // Add error underline + if (isErrorLine) { + const spacing = ' '.repeat(startPos.column - 1 + 7); + let markerLength = 1; + + if (startPos.line === endPos.line) { + markerLength = Math.max(1, endPos.column - startPos.column); + } else { + markerLength = source.lines[i].length - startPos.column + 1; + } + + const marker = '^'.repeat(Math.min(markerLength, 80)); + parts.push(severityColor(spacing + marker)); + } + } + } + } + + // Hint + if (diagnostic.hint) { + parts.push(''); + parts.push(chalk.green(` 💡 ${diagnostic.hint}`)); + } + + return parts.join('\n'); + } + + /** + * Get all errors + */ + public getErrors(): Diagnostic[] { + return this.diagnostics.filter((d) => d.severity === Severity.ERROR); + } + + /** + * Get all warnings + */ + public getWarnings(): Diagnostic[] { + return this.diagnostics.filter((d) => d.severity === Severity.WARNING); + } + + /** + * Check if there are any errors + */ + public hasErrors(): boolean { + return this.getErrors().length > 0; + } + + /** + * Get count by severity + */ + public getCounts(): { errors: number; warnings: number; info: number } { + const counts = { errors: 0, warnings: 0, info: 0 }; + + for (const d of this.diagnostics) { + switch (d.severity) { + case Severity.ERROR: + counts.errors++; + break; + case Severity.WARNING: + counts.warnings++; + break; + case Severity.INFO: + counts.info++; + break; + } + } + + return counts; + } + + /** + * Format all diagnostics for CLI output + */ + public format( + options: { + showWarnings?: boolean; + showInfo?: boolean; + groupByFile?: boolean; + } = {}, + ): string { + const { showWarnings = true, showInfo = false, groupByFile = true } = options; + + const filtered = this.diagnostics.filter((d) => { + if (d.severity === Severity.ERROR) return true; + if (d.severity === Severity.WARNING) return showWarnings; + if (d.severity === Severity.INFO) return showInfo; + return false; + }); + + if (filtered.length === 0) { + return chalk.green('✓ No issues found'); + } + + const output: string[] = []; + + if (groupByFile) { + // Group by file + const byFile = new Map(); + const noFile: Diagnostic[] = []; + + for (const d of filtered) { + if (d.sourceFile) { + if (!byFile.has(d.sourceFile)) { + byFile.set(d.sourceFile, []); + } + byFile.get(d.sourceFile)!.push(d); + } else { + noFile.push(d); + } + } + + // Sort files + const sortedFiles = Array.from(byFile.keys()).sort(); + + for (const file of sortedFiles) { + output.push(chalk.underline.bold(path.relative(process.cwd(), file))); + output.push(''); + + const diagnostics = byFile.get(file)!.sort((a, b) => { + if (!a.range || !b.range) return 0; + return a.range.start - b.range.start; + }); + + for (const d of diagnostics) { + output.push(this.formatDiagnostic(d)); + output.push(''); + } + } + + // Add diagnostics without file + if (noFile.length > 0) { + output.push(chalk.underline.bold('General')); + output.push(''); + for (const d of noFile) { + output.push(this.formatDiagnostic(d)); + output.push(''); + } + } + } else { + // Simple list + for (const d of filtered) { + output.push(this.formatDiagnostic(d)); + output.push(''); + } + } + + // Summary + const counts = this.getCounts(); + const summary: string[] = []; + + if (counts.errors > 0) { + summary.push(chalk.red(`${counts.errors} error${counts.errors !== 1 ? 's' : ''}`)); + } + if (counts.warnings > 0 && showWarnings) { + summary.push(chalk.yellow(`${counts.warnings} warning${counts.warnings !== 1 ? 's' : ''}`)); + } + if (counts.info > 0 && showInfo) { + summary.push(chalk.blue(`${counts.info} info`)); + } + + output.push(chalk.bold(`Found ${summary.join(', ')}`)); + + return output.join('\n'); + } + + /** + * Print formatted errors to console + */ + public print(options?: Parameters[0]): void { + console.log(this.format(options)); + } + + /** + * Get all diagnostics + */ + public getDiagnostics(): ReadonlyArray { + return this.diagnostics; + } +} + +// Create singleton instance +const errorCollector = new ErrorCollector(); + +export default errorCollector; diff --git a/packages/poml/next/source.ts b/packages/poml/next/source.ts new file mode 100644 index 00000000..50a09fb3 --- /dev/null +++ b/packages/poml/next/source.ts @@ -0,0 +1,115 @@ +import * as fs from 'fs'; +import { SourceFileCache, Position } from './types'; + +export class SourceManager { + private sourceCache = new Map(); + private currentSourceFile?: string; + private currentSourceContent?: string; + + /** + * Set the current source file context for subsequent errors + */ + public setCurrentFile(sourceFile: string, content?: string): void { + this.currentSourceFile = sourceFile; + this.currentSourceContent = content; + + if (content && sourceFile) { + this.cacheSource(sourceFile, content); + } + } + + /** + * Clear current file context + */ + public clearCurrentFile(): void { + this.currentSourceFile = undefined; + this.currentSourceContent = undefined; + } + + public getCurrentFile(): string | undefined { + return this.currentSourceFile; + } + + public getCurrentFileContent(): string | undefined { + return this.currentSourceContent; + } + + /** + * Clear all + */ + public clear(): void { + this.sourceCache.clear(); + this.clearCurrentFile(); + } + + /** + * Cache source file content + */ + private cacheSource(file: string, content: string): void { + const lines = content.split('\n'); + const lineStarts: number[] = [0]; + + let pos = 0; + for (const line of lines) { + pos += line.length + 1; // +1 for newline + lineStarts.push(pos); + } + + this.sourceCache.set(file, { + content, + lines, + lineStarts, + }); + } + + /** + * Load source file if not cached + */ + public loadSource(file: string): SourceFileCache | null { + if (this.sourceCache.has(file)) { + return this.sourceCache.get(file)!; + } + + try { + const content = fs.readFileSync(file, 'utf8'); + this.cacheSource(file, content); + return this.sourceCache.get(file)!; + } catch (error) { + return null; + } + } + + /** + * Convert byte position to line/column + */ + public indexToPosition(source: SourceFileCache, index: number): Position { + const { lineStarts } = source; + + // Binary search for the line + let line = 0; + let left = 0; + let right = lineStarts.length - 1; + + while (left < right) { + const mid = Math.floor((left + right + 1) / 2); + if (lineStarts[mid] <= index) { + left = mid; + } else { + right = mid - 1; + } + } + + line = left; + const column = index - lineStarts[line]; + + return { + line: line + 1, // 1-based + column: column + 1, // 1-based + index, + }; + } +} + +// Create singleton instance +const sourceManager = new SourceManager(); +export default sourceManager; diff --git a/packages/poml/next/types.ts b/packages/poml/next/types.ts new file mode 100644 index 00000000..ce6ac3b5 --- /dev/null +++ b/packages/poml/next/types.ts @@ -0,0 +1,48 @@ +/** + * Range in source file (byte positions) + */ +export interface Range { + start: number; + end: number; +} + +/** + * Error severity levels + */ +export enum Severity { + ERROR = 'error', + WARNING = 'warning', + INFO = 'info', +} + +/** + * Diagnostic interface + */ +export interface Diagnostic { + severity: Severity; + message: string; + sourceFile?: string; + range?: Range; + code?: string; + hint?: string; + originalError?: Error; +} + +/** + * Position with line and column + */ +export interface Position { + line: number; + column: number; + index: number; +} + +/** + * Source file cache entry + */ +export interface SourceFileCache { + filePath?: string; + content: string; + lines: string[]; + lineStarts: number[]; +} From 3726698e2a41f676d442e4921a1ad8b4d4b76ef2 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 26 Aug 2025 17:46:07 +0800 Subject: [PATCH 21/76] . --- packages/poml/next/ast.ts | 189 ++++++++++++++++++++++++++++---------- 1 file changed, 143 insertions(+), 46 deletions(-) diff --git a/packages/poml/next/ast.ts b/packages/poml/next/ast.ts index c8db6b8a..8e54f437 100644 --- a/packages/poml/next/ast.ts +++ b/packages/poml/next/ast.ts @@ -1,61 +1,158 @@ import { Tokenizer, Token } from './tokenizer'; import componentDocs from '../assets/componentDocs.json'; +import { Range } from './types'; + +export interface Node { + kind: + | 'META' + | 'EXPRESSION' + | 'VALUE' + | 'STRING' + | 'VALUE' + | 'FORLOOP' + | 'OPEN' + | 'CLOSE' + | 'SELFCLOSE' + | 'ELEMENT' + | 'TEXT' + | 'POML' + | 'ATTRIBUTE' + | 'TEMPLATE'; + range: Range; // Range of the entire node in source +} + +export interface ExpressionNode extends Node { + kind: 'EXPRESSION'; + value: string; +} + +/** + * A template node could be: + * + * 1. the value in an attribute like `if="i > 0"` -> `"i > 0"` with quotes + * 2. a standalone template variable like `{{ userName }}` + * 3. + */ +export interface TemplateNode extends Node { + kind: 'TEMPLATE'; + value: ExpressionNode; +} + +/** + * A string node represents a pure text, without any quotes or template variables. + * + * It's also sometimes reused to represent a key, an identifier, or a tag name. + */ +export interface StringNode extends Node { + kind: 'STRING'; + value: string; +} + +/** + * A value node could be: + * + * 1. a quoted attribute value: "some text" or 'some text' + * 2. text content between tags with white spaces: > some text + * 3. quoted or not quoted template values: {{ someVar }} or "{{ var }}" + * 4. mixture of text and template variables: "Hello, {{ userName }}!" + * + * The value node always include the full range, including quotes if any. + * But it's children only include the inner parts, excluding quotes. + */ +export interface ValueNode extends Node { + kind: 'VALUE'; + children: (StringNode | TemplateNode)[]; +} -// Source position and attribute interfaces -export interface SourceRange { - start: number; - end: number; +/** + * A for loop node could be like: + * + * ``` + * + * ``` + * + * More advanced versions are not supported yet. + */ +export interface ForLoopNode extends Node { + kind: 'FORLOOP'; + iterator: StringNode; + collection: ExpressionNode; } -export interface AttributeInfo { - key: string; - value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; // Mixed content: array of text/template nodes - keyRange: SourceRange; // Position of attribute name - valueRange: SourceRange; // Position of attribute value (excluding quotes) - fullRange: SourceRange; // Full attribute including key="value" +export interface AttributeNode extends Node { + kind: 'ATTRIBUTE'; + key: StringNode; + value: ValueNode; } -// Main AST node interface -export interface ASTNode { - id: string; // Unique ID for caching and React keys - kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE'; - start: number; // Source position start of entire node - end: number; // Source position end of entire node - content: string; // The raw string content - parent?: ASTNode; // Reference to the parent node - children: ASTNode[]; // Child nodes - - // For POML and META nodes - tagName?: string; // Tag name (e.g., 'task', 'meta') - attributes?: AttributeInfo[]; // Detailed attribute information - - // Detailed source positions - openingTag?: { - start: number; // Position of '<' - end: number; // Position after '>' - nameRange: SourceRange; // Position of tag name - }; - - closingTag?: { - start: number; // Position of '' - nameRange: SourceRange; // Position of tag name in closing tag - }; - - contentRange?: SourceRange; // Position of content between tags (excluding nested tags) - - // For TEXT nodes - textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML) - - // For TEMPLATE nodes - expression?: string; // The full expression content between {{}} +export interface ForLoopAttributeNode extends Node { + kind: 'ATTRIBUTE'; + key: StringNode; // Always "for" + value: ForLoopNode; +} + +export interface OpenTagNode extends Node { + kind: 'OPEN'; + value: StringNode; + attributes: (AttributeNode | ForLoopAttributeNode)[]; +} + +export interface CloseTagNode extends Node { + kind: 'CLOSE'; + value: StringNode; +} + +export interface SelfCloseTagNode extends Node { + kind: 'SELFCLOSE'; + value: StringNode; + attributes: (AttributeNode | ForLoopAttributeNode)[]; +} + +export interface ElementNode extends Node { + kind: 'ELEMENT'; + tagName: StringNode; + children: (ElementNode | ValueNode)[]; +} + +export interface TextNode extends Node { + kind: 'TEXT'; + tagName: StringNode; // Always "text" + // We don't allow anything here yet. + attributes: AttributeNode[]; + value: StringNode; +} + +export interface MetaNode extends Node { + kind: 'META'; + tagName: StringNode; + attributes: AttributeNode[]; } -// AST Parser class class ASTParser { private tokens: Token[]; private position: number; private nextId: number; + + // These are the tags that are always valid in POML. + // You can not disable them. + private alwaysValidTags = new Set(['text', 'meta']); + + // These semantics are handled right here. + private nonComponentTags = new Set([ + 'let', + 'include', + 'template', + 'context', + 'stylesheet', + 'output-schema', + 'outputschema', + 'tool', + 'tool-def', + 'tool-definition', + 'tooldef', + 'tooldefinition', + ]); + private validPomlTags: Set; constructor(tokens: Token[]) { @@ -66,7 +163,7 @@ class ASTParser { } private buildValidTagsSet(): Set { - const validTags = new Set(); + const validTags = new Set(this.alwaysValidTags); for (const doc of componentDocs) { if (doc.name) { From c092ed7f5bd04b4a53d08e0ccc7319803034bb2e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 26 Aug 2025 18:15:25 +0800 Subject: [PATCH 22/76] add ast --- packages/poml/next/ast.ts | 295 ++++++++++++++++++++++++++++++++++--- packages/poml/next/node.ts | 0 2 files changed, 271 insertions(+), 24 deletions(-) create mode 100644 packages/poml/next/node.ts diff --git a/packages/poml/next/ast.ts b/packages/poml/next/ast.ts index 8e54f437..f9943dcc 100644 --- a/packages/poml/next/ast.ts +++ b/packages/poml/next/ast.ts @@ -2,6 +2,22 @@ import { Tokenizer, Token } from './tokenizer'; import componentDocs from '../assets/componentDocs.json'; import { Range } from './types'; +/** + * Base interface for all AST nodes in the POML syntax tree. + * + * Every node in the AST must have a kind discriminator and a range indicating + * its position in the source text. The kind field enables TypeScript discriminated + * unions for type-safe node handling. + * + * Cases that apply: + * - All syntactic constructs in POML markup (elements, attributes, text, templates) + * - Meta-level constructs (root nodes, expression nodes) + * + * Cases that do not apply: + * - Lexical tokens (these are handled by the tokenizer) + * - Semantic information (component types, validation results) + * - Runtime values (evaluated expressions, resolved variables) + */ export interface Node { kind: | 'META' @@ -17,21 +33,52 @@ export interface Node { | 'TEXT' | 'POML' | 'ATTRIBUTE' - | 'TEMPLATE'; - range: Range; // Range of the entire node in source + | 'TEMPLATE' + | 'ROOT'; + range: Range; } +/** + * Represents a JavaScript expression as a string. + * + * This node stores raw expression text that will be evaluated at runtime. + * It serves as a wrapper for expressions used in various contexts like + * conditions, loops, and template interpolations. + * + * Cases that apply: + * - Conditional expressions: `i > 0`, `user.name === "admin"` + * - Collection accessors: `items.everything`, `data[0].value` + * - Function calls: `formatDate(now)`, `items.filter(x => x.active)` + * - Property paths: `user.profile.settings.theme` + * + * Cases that do not apply: + * - Template syntax including braces: `{{ expression }}` (use TemplateNode) + * - String literals with quotes: `"hello"` (use StringNode or ValueNode) + * - POML markup: `` (use element nodes) + */ export interface ExpressionNode extends Node { kind: 'EXPRESSION'; value: string; } /** - * A template node could be: + * Represents a template interpolation with double curly braces. + * + * Template nodes handle variable interpolation in POML, containing an + * expression that will be evaluated and substituted at runtime. The node + * preserves the template syntax for proper rendering and error reporting. * - * 1. the value in an attribute like `if="i > 0"` -> `"i > 0"` with quotes - * 2. a standalone template variable like `{{ userName }}` - * 3. + * Cases that apply: + * - Standalone template variables: `{{ userName }}`, `{{ count + 1 }}` + * - Template expressions in text: part of "Hello {{ name }}!" + * - Complex expressions: `{{ users.map(u => u.name).join(", ") }}` + * - Conditional rendering: `{{ isVisible ? "Show" : "Hide" }}` + * + * Cases that do not apply: + * - Attribute expressions without braces: `if="x > 0"` (use ExpressionNode) + * - Plain text: `Hello World` (use StringNode) + * - POML elements: `
    ` (use element nodes) + * - Single braces: `{ not a template }` (treated as plain text) */ export interface TemplateNode extends Node { kind: 'TEMPLATE'; @@ -39,9 +86,25 @@ export interface TemplateNode extends Node { } /** - * A string node represents a pure text, without any quotes or template variables. + * Represents plain text content without any special syntax. + * + * String nodes are the most basic content nodes, containing literal text + * that requires no processing. They are used both for content and as + * components of other nodes (like attribute keys and tag names). * - * It's also sometimes reused to represent a key, an identifier, or a tag name. + * Cases that apply: + * - Plain text content: `Hello World`, `This is a paragraph` + * - Long text blocks in `` elements: `some long text continued` + * - Attribute keys: the `class` in `class="container"` + * - Tag names: the `div` in `
    ` + * - Identifiers: variable names like `item` in for loops + * - Whitespace and formatting text between elements + * + * Cases that do not apply: + * - Text containing templates: `Hello {{ name }}` (use ValueNode with children) + * - Quoted strings in attributes: `"value"` (use ValueNode) + * - Expressions: `x > 0` (use ExpressionNode) + * - Template variables: `{{ var }}` (use TemplateNode) */ export interface StringNode extends Node { kind: 'STRING'; @@ -49,15 +112,26 @@ export interface StringNode extends Node { } /** - * A value node could be: + * Represents a composite value that may contain text and/or templates. + * + * Value nodes are containers for mixed content, handling both pure text + * and interpolated templates. They preserve quote information when used + * as attribute values and support complex content composition. * - * 1. a quoted attribute value: "some text" or 'some text' - * 2. text content between tags with white spaces: > some text - * 3. quoted or not quoted template values: {{ someVar }} or "{{ var }}" - * 4. mixture of text and template variables: "Hello, {{ userName }}!" + * Cases that apply: + * - Quoted attribute values: `"some text"`, `'single quoted'` + * - Mixed content with templates: `"Hello, {{ userName }}!"` + * - Text content between tags: `> some text <` (including whitespace) + * - Unquoted template values in certain contexts + * - Multi-part content: `"Price: ${{ amount }} USD"` * - * The value node always include the full range, including quotes if any. - * But it's children only include the inner parts, excluding quotes. + * Cases that do not apply: + * - Attribute keys: `class=...` (the `class` part uses StringNode) + * - Pure expressions without quotes: `if=condition` (use ExpressionNode) + * - Tag names: `div` (use StringNode) + * - Standalone template variables not in a value context + * + * Note: The range includes quotes if present, but children exclude them. */ export interface ValueNode extends Node { kind: 'VALUE'; @@ -65,13 +139,24 @@ export interface ValueNode extends Node { } /** - * A for loop node could be like: + * Represents a for-loop iteration construct in POML. + * + * For loops enable iterative rendering of elements, following the pattern + * "iterator in collection". This node captures both the loop variable + * and the collection expression for runtime evaluation. * - * ``` - * - * ``` + * Cases that apply: + * - Simple iteration: `item in items` + * - Property access: `user in data.users` + * - Array literals: `num in [1, 2, 3]` + * - Method calls: `result in getResults()` + * - Nested property iteration: `task in project.tasks.active` * - * More advanced versions are not supported yet. + * Cases that do not apply (not yet supported): + * - Advanced loop syntax (not yet supported): `(item, index) in items` + * - Destructuring patterns (not yet supported): `{name, age} in users` + * - Conditional loops: `if` attributes (use separate condition handling) + * - Template interpolation: `{{ items }}` (use TemplateNode) */ export interface ForLoopNode extends Node { kind: 'FORLOOP'; @@ -79,55 +164,217 @@ export interface ForLoopNode extends Node { collection: ExpressionNode; } +/** + * Represents a standard attribute on a POML element. + * + * Attributes provide metadata and configuration for elements. They consist + * of a key-value pair where the key is always a simple string and the value + * can be a complex composition of text and templates. + * + * Cases that apply: + * - Simple attributes: `class="container"`, `id='main'` + * - Boolean/presence attributes: `disabled`, `checked` + * - Template values: `title="{{ pageTitle }}"` or `title={{ pageTitle }}` + * - Mixed values: `placeholder="Enter {{ fieldName }}..."` + * + * Cases that do not apply: + * - For-loop attributes: `for="item in items"` (use ForLoopAttributeNode) + * - Spread attributes (not yet supported): `{...props}` + * - Dynamic attribute names (not supported): `[attrName]="value"` + */ export interface AttributeNode extends Node { kind: 'ATTRIBUTE'; key: StringNode; value: ValueNode; } +/** + * Represents a special for-loop attribute on POML elements. + * + * This specialized attribute node handles the `for` attribute specifically, + * which contains loop iteration syntax rather than a simple value. It enables + * elements to be rendered multiple times based on a collection. + * + * Cases that apply: + * - For attributes only: `for="item in items"` + * - Nested iterations: `for="subitem in item.children"` + * - Computed collections: `for="i in [...Array(5).keys()]"` + * + * Cases that do not apply: + * - Any attribute with a key other than "for" + * - Standard attributes: `class="..."` (use AttributeNode) + * - Conditional attributes: `if="..."` (use AttributeNode) + */ export interface ForLoopAttributeNode extends Node { kind: 'ATTRIBUTE'; - key: StringNode; // Always "for" + key: StringNode; value: ForLoopNode; } +/** + * Represents an opening tag in POML markup. + * + * Open tags mark the beginning of an element that expects a corresponding + * closing tag. They may contain attributes that configure the element's + * behavior and appearance. + * + * Cases that apply: + * - Standard opening tags: ``, `` + * - Tags with attributes: `
    ` + * - Tags with for-loops: `` + * - Nested structure beginnings: `
    ` before content + * + * Cases that do not apply: + * - Self-closing tags: `` (use SelfCloseTagNode) + * - Closing tags: `` (use CloseTagNode) + * - Complete elements: opening + content + closing (use ElementNode) + * - Invalid or malformed tags (treated as text) + */ export interface OpenTagNode extends Node { kind: 'OPEN'; value: StringNode; attributes: (AttributeNode | ForLoopAttributeNode)[]; } +/** + * Represents a closing tag in POML markup. + * + * Close tags mark the end of an element, matching a previously opened tag. + * They contain only the tag name and no attributes. + * + * Cases that apply: + * - Standard closing tags: ``, `` + * - Nested structure endings: `
    `, `
    ` + * - Any valid POML element closure + * + * Cases that do not apply: + * - Opening tags: `` (use OpenTagNode) + * - Self-closing tags: `
    ` (use SelfCloseTagNode) + * - Tags with attributes (closing tags never have attributes) + * - Mismatched closing tags (parser error) + */ export interface CloseTagNode extends Node { kind: 'CLOSE'; value: StringNode; } +/** + * Represents a self-closing tag in POML markup. + * + * Self-closing tags represent complete elements that have no children or + * content. They combine opening and closing in a single tag and may have + * attributes. + * + * Cases that apply: + * - Image elements: `` + * - Meta elements: `` + * - Data elements without content: `` + * - Any element explicitly self-closed: `` + * + * Cases that do not apply: + * - Elements with content: `
    content
    ` (use ElementNode) + * - Separate open/close tags: `
    ` (use ElementNode) + * - Tags without the self-closing slash: `` (use OpenTagNode) + * - Text content elements (these require open/close pairs) + */ export interface SelfCloseTagNode extends Node { kind: 'SELFCLOSE'; value: StringNode; attributes: (AttributeNode | ForLoopAttributeNode)[]; } +/** + * Represents a complete POML element with its content. + * + * Element nodes are high-level constructs that represent semantic POML + * components. They contain a tag name, optional attributes (inherited from + * open tag), and may have child content including other elements, text, + * or values. + * + * Cases that apply: + * - Document structures: `...content...` + * - Messages: `Hello` + * - Nested elements: `
    Text
    ` + * - Data components: `...rows...
    ` + * + * Cases that do not apply: + * - Self-closing elements: `` (use SelfCloseTagNode) + * - Raw text content: plain text outside elements (use TextNode) + * - Template variables: `{{ var }}` (use TemplateNode) + * - Meta elements: `` tags (use MetaNode) + */ export interface ElementNode extends Node { kind: 'ELEMENT'; tagName: StringNode; - children: (ElementNode | ValueNode)[]; + children: (ElementNode | TextNode | MetaNode | ValueNode)[]; } +/** + * Represents a text element that preserves literal content. + * + * Text nodes are special POML elements that treat their content as literal + * text, preventing template variable interpolation. They ensure content is + * preserved exactly as written, useful for code samples or pre-formatted text. + * + * Cases that apply: + * - Explicit text elements: `Literal {{ not_interpolated }}` + * + * Cases that do not apply: + * - Regular text content with interpolation (use ValueNode) + * - Plain text outside elements (use ValueNode) + * - Elements allowing template processing (use ElementNode) + * - Text with attributes enabling processing (future feature) + * + * Note: The tagName is always "text" for these nodes, and attributes must be empty. + */ export interface TextNode extends Node { kind: 'TEXT'; - tagName: StringNode; // Always "text" - // We don't allow anything here yet. + tagName: StringNode; attributes: AttributeNode[]; value: StringNode; } +/** + * Represents metadata elements in POML. + * + * Meta nodes provide document-level metadata and configuration that doesn't + * render as visible content. They typically appear at the document start and + * configure processing behavior, document properties, or provide auxiliary + * information. + * + * Cases that apply: + * - Document metadata: `` + * - Configuration: `` + * + * Cases that do not apply: + * - Any element that is not `` (use ElementNode) + */ export interface MetaNode extends Node { kind: 'META'; tagName: StringNode; attributes: AttributeNode[]; } +/** + * Represents the root node of a POML document tree. + * + * Root nodes serve as the top-level container for all document content when + * there isn't an explicit `` wrapper. They provide a consistent entry + * point for document traversal and processing. + * + * Cases that apply: + * - Documents without `` wrapper + * - Documents with multiple top-level elements + * - Documents with `` but surrounded by white spaces or comments + * + * Cases that do not apply: + * - All nested elements + */ +export interface RootNode extends Node { + kind: 'ROOT'; + children: (ElementNode | TextNode | MetaNode | ValueNode)[]; +} + class ASTParser { private tokens: Token[]; private position: number; diff --git a/packages/poml/next/node.ts b/packages/poml/next/node.ts new file mode 100644 index 00000000..e69de29b From 82e028d72b70e29d17a084ddb4584d3f519a857a Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 26 Aug 2025 18:23:53 +0800 Subject: [PATCH 23/76] fix cst --- packages/poml/next/cst.ts | 74 ++--- packages/poml/next/node.ts | 0 packages/poml/next/{ast.ts => nodes.ts} | 377 ------------------------ 3 files changed, 23 insertions(+), 428 deletions(-) delete mode 100644 packages/poml/next/node.ts rename packages/poml/next/{ast.ts => nodes.ts} (55%) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 6ede8a39..ff13c4b0 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -16,62 +16,14 @@ import { } from './lexer'; import { listComponentAliases } from '../base'; - -// Source position interfaces -export interface SourceRange { - start: number; - end: number; -} - -export interface AttributeInfo { - key: string; - value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; - keyRange: SourceRange; - valueRange: SourceRange; - fullRange: SourceRange; -} - -// Core AST node interface -export interface ASTNode { - id: string; - kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE'; - start: number; - end: number; - content: string; - parent?: ASTNode; - children: ASTNode[]; - - // For POML and META nodes - tagName?: string; - attributes?: AttributeInfo[]; - - // Detailed source positions - openingTag?: { - start: number; - end: number; - nameRange: SourceRange; - }; - - closingTag?: { - start: number; - end: number; - nameRange: SourceRange; - }; - - contentRange?: SourceRange; - - // For TEXT nodes - textSegments?: SourceRange[]; - - // For TEMPLATE nodes - expression?: string; -} +import * as Nodes from './nodes'; // Context for parsing configuration export interface PomlContext { variables: { [key: string]: any }; stylesheet: { [key: string]: string }; - minimalPomlVersion?: string; + minPomlVersion?: string; + maxPomlVersion?: string; sourcePath: string; enabledComponents: Set; unknownComponentBehavior: 'error' | 'warning' | 'ignore'; @@ -85,6 +37,26 @@ export class CSTParser { private context: PomlContext; private nodeIdCounter: number; + // These are the tags that are always valid in POML. + // You can not disable them. + private alwaysValidTags = new Set(['text', 'meta']); + + // These semantics are handled right here. + private nonComponentTags = new Set([ + 'let', + 'include', + 'template', + 'context', + 'stylesheet', + 'output-schema', + 'outputschema', + 'tool', + 'tool-def', + 'tool-definition', + 'tooldef', + 'tooldefinition', + ]); + constructor(context: PomlContext) { this.tokens = []; this.position = 0; diff --git a/packages/poml/next/node.ts b/packages/poml/next/node.ts deleted file mode 100644 index e69de29b..00000000 diff --git a/packages/poml/next/ast.ts b/packages/poml/next/nodes.ts similarity index 55% rename from packages/poml/next/ast.ts rename to packages/poml/next/nodes.ts index f9943dcc..d4995984 100644 --- a/packages/poml/next/ast.ts +++ b/packages/poml/next/nodes.ts @@ -1,5 +1,3 @@ -import { Tokenizer, Token } from './tokenizer'; -import componentDocs from '../assets/componentDocs.json'; import { Range } from './types'; /** @@ -374,378 +372,3 @@ export interface RootNode extends Node { kind: 'ROOT'; children: (ElementNode | TextNode | MetaNode | ValueNode)[]; } - -class ASTParser { - private tokens: Token[]; - private position: number; - private nextId: number; - - // These are the tags that are always valid in POML. - // You can not disable them. - private alwaysValidTags = new Set(['text', 'meta']); - - // These semantics are handled right here. - private nonComponentTags = new Set([ - 'let', - 'include', - 'template', - 'context', - 'stylesheet', - 'output-schema', - 'outputschema', - 'tool', - 'tool-def', - 'tool-definition', - 'tooldef', - 'tooldefinition', - ]); - - private validPomlTags: Set; - - constructor(tokens: Token[]) { - this.tokens = tokens; - this.position = 0; - this.nextId = 0; - this.validPomlTags = this.buildValidTagsSet(); - } - - private buildValidTagsSet(): Set { - const validTags = new Set(this.alwaysValidTags); - - for (const doc of componentDocs) { - if (doc.name) { - validTags.add(doc.name.toLowerCase()); - // Convert camelCase to kebab-case - validTags.add( - doc.name - .toLowerCase() - .replace(/([A-Z])/g, '-$1') - .toLowerCase(), - ); - } - } - - // Add special tags - validTags.add('poml'); - validTags.add('text'); - validTags.add('meta'); - - return validTags; - } - - private generateId(): string { - return `ast_${this.nextId++}`; - } - - private peek(): Token | undefined { - return this.tokens[this.position]; - } - - private advance(): Token | undefined { - return this.tokens[this.position++]; - } - - private extractTagName(tagContent: string): string { - // Remove < and > and any attributes - const content = tagContent.slice(1, -1); - const match = content.match(/^\/?\s*([a-zA-Z][\w-]*)/); - return match ? match[1] : ''; - } - - private parseAttributeValue(value: string): (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] { - // Parse attribute value for mixed text and template variables - const result: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] = []; - let currentPos = 0; - - while (currentPos < value.length) { - const templateStart = value.indexOf('{{', currentPos); - - if (templateStart === -1) { - // No more template variables, add remaining text - if (currentPos < value.length) { - result.push({ - id: this.generateId(), - kind: 'TEXT', - start: currentPos, - end: value.length, - content: value.substring(currentPos), - children: [], - }); - } - break; - } - - // Add text before template variable - if (templateStart > currentPos) { - result.push({ - id: this.generateId(), - kind: 'TEXT', - start: currentPos, - end: templateStart, - content: value.substring(currentPos, templateStart), - children: [], - }); - } - - // Find end of template variable - const templateEnd = value.indexOf('}}', templateStart + 2); - if (templateEnd === -1) { - // Malformed template, treat as text - result.push({ - id: this.generateId(), - kind: 'TEXT', - start: templateStart, - end: value.length, - content: value.substring(templateStart), - children: [], - }); - break; - } - - // Add template variable - const templateContent = value.substring(templateStart + 2, templateEnd); - result.push({ - id: this.generateId(), - kind: 'TEMPLATE', - start: templateStart, - end: templateEnd + 2, - content: value.substring(templateStart, templateEnd + 2), - expression: templateContent.trim(), - children: [], - }); - - currentPos = templateEnd + 2; - } - - return result; - } - - private parseAttributes(tagContent: string): AttributeInfo[] { - const attributes: AttributeInfo[] = []; - - // Simple attribute parsing - can be enhanced later - const attrRegex = /(\w+)=["']([^"']*?)["']/g; - let match; - - while ((match = attrRegex.exec(tagContent)) !== null) { - const key = match[1]; - const value = match[2]; - const fullMatch = match[0]; - const matchStart = match.index; - - attributes.push({ - key, - value: this.parseAttributeValue(value), - keyRange: { start: matchStart, end: matchStart + key.length }, - valueRange: { start: matchStart + key.length + 2, end: matchStart + key.length + 2 + value.length }, - fullRange: { start: matchStart, end: matchStart + fullMatch.length }, - }); - } - - return attributes; - } - - parse(): ASTNode { - const children = this.parseNodes(); - - if (children.length === 1 && children[0].kind === 'POML') { - return children[0]; - } - - // Create root text node - const rootNode: ASTNode = { - id: this.generateId(), - kind: 'TEXT', - start: 0, - end: this.tokens.length > 0 ? this.tokens[this.tokens.length - 1].end : 0, - content: this.tokens.map((t) => t.value).join(''), - children, - textSegments: [], - }; - - // Set parent references - children.forEach((child) => { - child.parent = rootNode; - }); - - return rootNode; - } - - private parseNodes(): ASTNode[] { - const nodes: ASTNode[] = []; - - while (this.position < this.tokens.length) { - const token = this.peek(); - if (!token) break; - - if (token.type === 'TEMPLATE_VAR') { - nodes.push(this.parseTemplateVariable()); - } else if (token.type === 'TAG_OPEN') { - const tagName = this.extractTagName(token.value); - - if (this.validPomlTags.has(tagName.toLowerCase())) { - const node = this.parsePomlNode(); - if (node) { - nodes.push(node); - } - } else { - // Invalid tag, treat as text - nodes.push(this.parseTextFromToken()); - } - } else if (token.type === 'TEXT') { - nodes.push(this.parseTextFromToken()); - } else { - // Skip other token types for now - this.advance(); - } - } - - return nodes; - } - - private parseTemplateVariable(): ASTNode { - const token = this.advance()!; - const expression = token.value.slice(2, -2).trim(); // Remove {{ and }} - - return { - id: this.generateId(), - kind: 'TEMPLATE', - start: token.start, - end: token.end, - content: token.value, - expression, - children: [], - }; - } - - private parseTextFromToken(): ASTNode { - const token = this.advance()!; - - return { - id: this.generateId(), - kind: 'TEXT', - start: token.start, - end: token.end, - content: token.value, - children: [], - textSegments: [{ start: token.start, end: token.end }], - }; - } - - private parsePomlNode(): ASTNode | null { - const openToken = this.advance()!; - const tagName = this.extractTagName(openToken.value); - - // Parse attributes - const attributes = this.parseAttributes(openToken.value); - - // Determine node kind - const kind = tagName.toLowerCase() === 'meta' ? 'META' : 'POML'; - - const node: ASTNode = { - id: this.generateId(), - kind, - start: openToken.start, - end: openToken.end, // Will be updated when we find closing tag - content: openToken.value, // Will be updated - tagName: tagName.toLowerCase(), - attributes, - children: [], - openingTag: { - start: openToken.start, - end: openToken.end, - nameRange: { - start: openToken.start + 1, - end: openToken.start + 1 + tagName.length, - }, - }, - }; - - // Parse children until we find the closing tag - const children: ASTNode[] = []; - let depth = 1; - - while (this.position < this.tokens.length && depth > 0) { - const token = this.peek(); - if (!token) break; - - if (token.type === 'TAG_OPEN') { - const childTagName = this.extractTagName(token.value); - if (childTagName.toLowerCase() === tagName.toLowerCase()) { - depth++; - } - - // Special handling for text tags - don't process template variables - if (tagName.toLowerCase() === 'text') { - children.push(this.parseTextFromToken()); - } else if (this.validPomlTags.has(childTagName.toLowerCase())) { - const childNode = this.parsePomlNode(); - if (childNode) { - childNode.parent = node; - children.push(childNode); - } - } else { - children.push(this.parseTextFromToken()); - } - } else if (token.type === 'TAG_CLOSE') { - const closeTagName = this.extractTagName(token.value); - if (closeTagName.toLowerCase() === tagName.toLowerCase()) { - depth--; - if (depth === 0) { - // Found our closing tag - const closeToken = this.advance()!; - node.end = closeToken.end; - node.closingTag = { - start: closeToken.start, - end: closeToken.end, - nameRange: { - start: closeToken.start + 2, - end: closeToken.start + 2 + tagName.length, - }, - }; - break; - } - } - this.advance(); - } else if (token.type === 'TEMPLATE_VAR' && tagName.toLowerCase() !== 'text') { - // Only parse template variables outside of text tags - const templateNode = this.parseTemplateVariable(); - templateNode.parent = node; - children.push(templateNode); - } else { - const textNode = this.parseTextFromToken(); - textNode.parent = node; - children.push(textNode); - } - } - - node.children = children; - - // Update content to include full tag - if (node.closingTag) { - node.content = this.tokens - .slice( - this.tokens.findIndex((t) => t.start === node.start), - this.tokens.findIndex((t) => t.end === node.end) + 1, - ) - .map((t) => t.value) - .join(''); - } - - return node; - } -} - -// Main parsing function -export function parseAST(content: string): ASTNode { - const tokenizer = new Tokenizer(content); - const tokens = tokenizer.tokenize(); - const parser = new ASTParser(tokens); - return parser.parse(); -} - -export class PomlAstParser { - static parse(content: string): ASTNode { - return parseAST(content); - } -} From 297a50c0decb81ff13d4031b8151a0c8f6b5761c Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 29 Aug 2025 14:34:09 +0800 Subject: [PATCH 24/76] update nodes --- packages/poml/next/lexer.ts | 21 +++-- packages/poml/next/nodes.ts | 169 ++++++++++++++++++++---------------- 2 files changed, 107 insertions(+), 83 deletions(-) diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts index a646defd..1074146b 100644 --- a/packages/poml/next/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -16,15 +16,18 @@ export const SingleQuote = createToken({ name: 'SingleQuote', pattern: /'/ }); export const Backslash = createToken({ name: 'Backslash', pattern: /\\/ }); /* Identifier is one of the following: - - XML tag names - - XML attribute names - - TextContent incorrectly parsed as identifiers - - Case 3 is handled later by CST parser. -*/ + * - XML tag names + * - XML attribute names + * - TextContent incorrectly parsed as identifiers + * + * Notes: + * 1. In case 1, tags can contain : (namespaces) and . (extensions). + * These are handled later by CST parser. + * 2. In case 3, CST parser will reclassify as TextContent if needed. + */ export const Identifier = createToken({ name: 'Identifier', - pattern: /[a-zA-Z_][a-zA-Z0-9_-]*/, + pattern: /[a-zA-Z_][a-zA-Z0-9_\-]*/, }); export const Whitespace = createToken({ @@ -33,7 +36,7 @@ export const Whitespace = createToken({ line_breaks: true, }); -/* eslint-disable no-irregular-whitespace */ + /* Catch-all for arbitrary text content - Match any char except: < — starts a tag @@ -46,7 +49,7 @@ export const TextContent = createToken({ pattern: /(?:[^<"'{}]|{(?!{)|}(?!}))+/, line_breaks: true, }); -/* eslint-enable no-irregular-whitespace */ + // Define token order - more specific patterns first export const allTokens = [ diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index d4995984..ba6cb1ae 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -1,41 +1,5 @@ import { Range } from './types'; -/** - * Base interface for all AST nodes in the POML syntax tree. - * - * Every node in the AST must have a kind discriminator and a range indicating - * its position in the source text. The kind field enables TypeScript discriminated - * unions for type-safe node handling. - * - * Cases that apply: - * - All syntactic constructs in POML markup (elements, attributes, text, templates) - * - Meta-level constructs (root nodes, expression nodes) - * - * Cases that do not apply: - * - Lexical tokens (these are handled by the tokenizer) - * - Semantic information (component types, validation results) - * - Runtime values (evaluated expressions, resolved variables) - */ -export interface Node { - kind: - | 'META' - | 'EXPRESSION' - | 'VALUE' - | 'STRING' - | 'VALUE' - | 'FORLOOP' - | 'OPEN' - | 'CLOSE' - | 'SELFCLOSE' - | 'ELEMENT' - | 'TEXT' - | 'POML' - | 'ATTRIBUTE' - | 'TEMPLATE' - | 'ROOT'; - range: Range; -} - /** * Represents a JavaScript expression as a string. * @@ -54,13 +18,15 @@ export interface Node { * - String literals with quotes: `"hello"` (use StringNode or ValueNode) * - POML markup: `` (use element nodes) */ -export interface ExpressionNode extends Node { +export interface ExpressionNode { kind: 'EXPRESSION'; + range: Range; value: string; } /** - * Represents a template interpolation with double curly braces. + * Represents a template interpolation with double curly braces, + * or sometimes without braces in specific attributes. * * Template nodes handle variable interpolation in POML, containing an * expression that will be evaluated and substituted at runtime. The node @@ -71,15 +37,18 @@ export interface ExpressionNode extends Node { * - Template expressions in text: part of "Hello {{ name }}!" * - Complex expressions: `{{ users.map(u => u.name).join(", ") }}` * - Conditional rendering: `{{ isVisible ? "Show" : "Hide" }}` + * - Template usage in if attributes: `condition` in `if="condition"` * * Cases that do not apply: - * - Attribute expressions without braces: `if="x > 0"` (use ExpressionNode) + * - Full attribute expressions: `if="x > 0"` (use ExpressionNode) * - Plain text: `Hello World` (use StringNode) - * - POML elements: `
    ` (use element nodes) * - Single braces: `{ not a template }` (treated as plain text) + * - Template elements: (use TextNode) + * - With quotes: `"{{ var }}"` (use ValueNode) */ -export interface TemplateNode extends Node { +export interface TemplateNode { kind: 'TEMPLATE'; + range: Range; value: ExpressionNode; } @@ -104,8 +73,9 @@ export interface TemplateNode extends Node { * - Expressions: `x > 0` (use ExpressionNode) * - Template variables: `{{ var }}` (use TemplateNode) */ -export interface StringNode extends Node { +export interface StringNode { kind: 'STRING'; + range: Range; value: string; } @@ -121,7 +91,7 @@ export interface StringNode extends Node { * - Mixed content with templates: `"Hello, {{ userName }}!"` * - Text content between tags: `> some text <` (including whitespace) * - Unquoted template values in certain contexts - * - Multi-part content: `"Price: ${{ amount }} USD"` + * - Multi-part content: `"Price: ${{amount}} USD"` * * Cases that do not apply: * - Attribute keys: `class=...` (the `class` part uses StringNode) @@ -131,8 +101,9 @@ export interface StringNode extends Node { * * Note: The range includes quotes if present, but children exclude them. */ -export interface ValueNode extends Node { +export interface ValueNode { kind: 'VALUE'; + range: Range; children: (StringNode | TemplateNode)[]; } @@ -156,8 +127,9 @@ export interface ValueNode extends Node { * - Conditional loops: `if` attributes (use separate condition handling) * - Template interpolation: `{{ items }}` (use TemplateNode) */ -export interface ForLoopNode extends Node { - kind: 'FORLOOP'; +export interface ForIteratorNode { + kind: 'FORITERATOR'; + range: Range; iterator: StringNode; collection: ExpressionNode; } @@ -171,17 +143,18 @@ export interface ForLoopNode extends Node { * * Cases that apply: * - Simple attributes: `class="container"`, `id='main'` - * - Boolean/presence attributes: `disabled`, `checked` * - Template values: `title="{{ pageTitle }}"` or `title={{ pageTitle }}` * - Mixed values: `placeholder="Enter {{ fieldName }}..."` * * Cases that do not apply: + * - Boolean/presence attributes: `disabled`, `checked` (not yet supported) * - For-loop attributes: `for="item in items"` (use ForLoopAttributeNode) * - Spread attributes (not yet supported): `{...props}` * - Dynamic attribute names (not supported): `[attrName]="value"` */ -export interface AttributeNode extends Node { +export interface AttributeNode { kind: 'ATTRIBUTE'; + range: Range; key: StringNode; value: ValueNode; } @@ -203,8 +176,9 @@ export interface AttributeNode extends Node { * - Standard attributes: `class="..."` (use AttributeNode) * - Conditional attributes: `if="..."` (use AttributeNode) */ -export interface ForLoopAttributeNode extends Node { - kind: 'ATTRIBUTE'; +export interface ForLoopAttributeNode { + kind: 'FORATTRIBUTE'; + range: Range; key: StringNode; value: ForLoopNode; } @@ -228,8 +202,9 @@ export interface ForLoopAttributeNode extends Node { * - Complete elements: opening + content + closing (use ElementNode) * - Invalid or malformed tags (treated as text) */ -export interface OpenTagNode extends Node { +export interface OpenTagNode { kind: 'OPEN'; + range: Range; value: StringNode; attributes: (AttributeNode | ForLoopAttributeNode)[]; } @@ -249,34 +224,34 @@ export interface OpenTagNode extends Node { * - Opening tags: `` (use OpenTagNode) * - Self-closing tags: `
    ` (use SelfCloseTagNode) * - Tags with attributes (closing tags never have attributes) - * - Mismatched closing tags (parser error) */ -export interface CloseTagNode extends Node { +export interface CloseTagNode { kind: 'CLOSE'; + range: Range; value: StringNode; } /** * Represents a self-closing tag in POML markup. * - * Self-closing tags represent complete elements that have no children or + * Self-closing elements represent complete elements that have no children or * content. They combine opening and closing in a single tag and may have * attributes. * * Cases that apply: * - Image elements: `` - * - Meta elements: `` - * - Data elements without content: `` - * - Any element explicitly self-closed: `` + * - Runtime configurations: `` * * Cases that do not apply: + * - Meta elements: `` * - Elements with content: `
    content
    ` (use ElementNode) * - Separate open/close tags: `
    ` (use ElementNode) * - Tags without the self-closing slash: `` (use OpenTagNode) - * - Text content elements (these require open/close pairs) + * - Meta elements: `` tags (use MetaNode) */ -export interface SelfCloseTagNode extends Node { +export interface SelfCloseElementNode { kind: 'SELFCLOSE'; + range: Range; value: StringNode; attributes: (AttributeNode | ForLoopAttributeNode)[]; } @@ -290,25 +265,26 @@ export interface SelfCloseTagNode extends Node { * or values. * * Cases that apply: - * - Document structures: `...content...` - * - Messages: `Hello` + * - Any elements: `...content...` + * - Output schemas with templates: `{{ schemaDefinition }}` * - Nested elements: `
    Text
    ` - * - Data components: `...rows...
    ` * * Cases that do not apply: * - Self-closing elements: `` (use SelfCloseTagNode) - * - Raw text content: plain text outside elements (use TextNode) + * - Literal text content: plain text (use TextNode) * - Template variables: `{{ var }}` (use TemplateNode) * - Meta elements: `` tags (use MetaNode) */ -export interface ElementNode extends Node { +export interface ElementNode { kind: 'ELEMENT'; - tagName: StringNode; + range: Range; + open: OpenTagNode; + close: CloseTagNode; children: (ElementNode | TextNode | MetaNode | ValueNode)[]; } /** - * Represents a text element that preserves literal content. + * Represents an element that preserves literal content. * * Text nodes are special POML elements that treat their content as literal * text, preventing template variable interpolation. They ensure content is @@ -316,6 +292,7 @@ export interface ElementNode extends Node { * * Cases that apply: * - Explicit text elements: `Literal {{ not_interpolated }}` + * - External templates: `` * * Cases that do not apply: * - Regular text content with interpolation (use ValueNode) @@ -323,23 +300,27 @@ export interface ElementNode extends Node { * - Elements allowing template processing (use ElementNode) * - Text with attributes enabling processing (future feature) * - * Note: The tagName is always "text" for these nodes, and attributes must be empty. + * Note: The tagName (value) can only be "text" or "template" in this version. */ -export interface TextNode extends Node { +export interface TextNode { kind: 'TEXT'; - tagName: StringNode; + range: Range; + open: OpenTagNode; + close: CloseTagNode; attributes: AttributeNode[]; - value: StringNode; + children: StringNode; } /** - * Represents metadata elements in POML. + * Represents metadata elements in POML. Meta elements must be self-closed. * * Meta nodes provide document-level metadata and configuration that doesn't * render as visible content. They typically appear at the document start and * configure processing behavior, document properties, or provide auxiliary * information. * + * Value must be "meta" (case-insensitive). + * * Cases that apply: * - Document metadata: `` * - Configuration: `` @@ -347,9 +328,10 @@ export interface TextNode extends Node { * Cases that do not apply: * - Any element that is not `` (use ElementNode) */ -export interface MetaNode extends Node { +export interface MetaNode { kind: 'META'; - tagName: StringNode; + range: Range; + value: StringNode; attributes: AttributeNode[]; } @@ -368,7 +350,46 @@ export interface MetaNode extends Node { * Cases that do not apply: * - All nested elements */ -export interface RootNode extends Node { +export interface RootNode { kind: 'ROOT'; + range: Range; children: (ElementNode | TextNode | MetaNode | ValueNode)[]; } + +// Keep these keys required; everything else becomes recursively optional +type DeepPartialExcept = + // arrays + T extends (infer U)[] + ? DeepPartialExcept[] + : // functions (leave as-is) + T extends (...args: any) => any + ? T + : // objects + T extends object + ? { [P in keyof T as P extends K ? P : never]-?: T[P] } & { + [P in keyof T as P extends K ? never : P]?: DeepPartialExcept | undefined; + } + : T; + +// Keep only "kind" required; everything else is optional, recursively. +type Draft = DeepPartialExcept; + +// Union of your strict nodes +export type StrictNode = + | ExpressionNode + | TemplateNode + | StringNode + | ValueNode + | ForLoopNode + | AttributeNode + | ForLoopAttributeNode + | OpenTagNode + | CloseTagNode + | SelfCloseElementNode + | ElementNode + | TextNode + | MetaNode + | RootNode; + +// The "loose" counterpart you can safely produce during parsing. +export type DraftNode = Draft; From af9c69cfdd5afcd48f4f7b0463869ee098c2340d Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 12:16:45 +0800 Subject: [PATCH 25/76] . --- packages/poml/next/lexer.ts | 4 +++- packages/poml/next/nodes.ts | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts index 1074146b..9d58174b 100644 --- a/packages/poml/next/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -1,7 +1,9 @@ import { createToken, Lexer } from 'chevrotain'; // Define token types for extended POML -export const Comment = createToken({ name: 'Comment', pattern: // }); +export const CommentOpen = createToken({ name: 'CommentOpen', pattern: // }); +export const Pragma = createToken({ name: 'Pragma', pattern: /\b@pragma\b/i }); export const TemplateOpen = createToken({ name: 'TemplateOpen', pattern: /{{/ }); export const TemplateClose = createToken({ name: 'TemplateClose', pattern: /}}/ }); export const TagClosingOpen = createToken({ name: 'TagClosingOpen', pattern: /<\// }); diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index ba6cb1ae..bf734333 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -289,6 +289,8 @@ export interface ElementNode { * Text nodes are special POML elements that treat their content as literal * text, preventing template variable interpolation. They ensure content is * preserved exactly as written, useful for code samples or pre-formatted text. + * When `` is used, the parser eats everything including tags and comments, + * including new `` tags, until a matching `` is found. * * Cases that apply: * - Explicit text elements: `Literal {{ not_interpolated }}` From 97e8ba233b64541f29f5e0293cf3777c9ffbf308 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 12:52:43 +0800 Subject: [PATCH 26/76] . --- packages/poml/next/nodes.ts | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index bf734333..0d682204 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -43,7 +43,7 @@ export interface ExpressionNode { * - Full attribute expressions: `if="x > 0"` (use ExpressionNode) * - Plain text: `Hello World` (use StringNode) * - Single braces: `{ not a template }` (treated as plain text) - * - Template elements: (use TextNode) + * - Template elements: (use LiteralNode) * - With quotes: `"{{ var }}"` (use ValueNode) */ export interface TemplateNode { @@ -271,7 +271,7 @@ export interface SelfCloseElementNode { * * Cases that do not apply: * - Self-closing elements: `` (use SelfCloseTagNode) - * - Literal text content: plain text (use TextNode) + * - Literal text content: plain text (use LiteralNode) * - Template variables: `{{ var }}` (use TemplateNode) * - Meta elements: `` tags (use MetaNode) */ @@ -280,13 +280,13 @@ export interface ElementNode { range: Range; open: OpenTagNode; close: CloseTagNode; - children: (ElementNode | TextNode | MetaNode | ValueNode)[]; + children: (ElementNode | LiteralNode | CommentNode | PragmaNode | ValueNode)[]; } /** * Represents an element that preserves literal content. * - * Text nodes are special POML elements that treat their content as literal + * Literal nodes are special POML elements that treat their content as literal * text, preventing template variable interpolation. They ensure content is * preserved exactly as written, useful for code samples or pre-formatted text. * When `` is used, the parser eats everything including tags and comments, @@ -294,7 +294,6 @@ export interface ElementNode { * * Cases that apply: * - Explicit text elements: `Literal {{ not_interpolated }}` - * - External templates: `` * * Cases that do not apply: * - Regular text content with interpolation (use ValueNode) @@ -302,9 +301,11 @@ export interface ElementNode { * - Elements allowing template processing (use ElementNode) * - Text with attributes enabling processing (future feature) * - * Note: The tagName (value) can only be "text" or "template" in this version. + * Note: The tagName (value) can only be "text" in this version. + * Literal node is different from elements which do not support children. + * Literal node is handled on the CST parsing stage. */ -export interface TextNode { +export interface LiteralNode { kind: 'TEXT'; range: Range; open: OpenTagNode; @@ -321,14 +322,9 @@ export interface TextNode { * configure processing behavior, document properties, or provide auxiliary * information. * - * Value must be "meta" (case-insensitive). - * * Cases that apply: - * - Document metadata: `` - * - Configuration: `` - * - * Cases that do not apply: - * - Any element that is not `` (use ElementNode) + * - Document metadata: `` + * - Configuration: `` */ export interface MetaNode { kind: 'META'; @@ -355,7 +351,7 @@ export interface MetaNode { export interface RootNode { kind: 'ROOT'; range: Range; - children: (ElementNode | TextNode | MetaNode | ValueNode)[]; + children: (ElementNode | LiteralNode | MetaNode | ValueNode)[]; } // Keep these keys required; everything else becomes recursively optional @@ -382,14 +378,14 @@ export type StrictNode = | TemplateNode | StringNode | ValueNode - | ForLoopNode + | ForIteratorNode | AttributeNode | ForLoopAttributeNode | OpenTagNode | CloseTagNode | SelfCloseElementNode | ElementNode - | TextNode + | LiteralNode | MetaNode | RootNode; From ddf120ca2d2080fb5508a3e77c97c174bb2b8b00 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 13:49:57 +0800 Subject: [PATCH 27/76] . --- packages/poml/next/cst.ts | 978 ++++++++++-------------------------- packages/poml/next/nodes.ts | 65 ++- 2 files changed, 315 insertions(+), 728 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index ff13c4b0..80417f22 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -1,718 +1,286 @@ -import { IToken } from 'chevrotain'; -import { - extendedPomlLexer, - TemplateOpen, - TemplateClose, - TagClosingOpen, - TagSelfClose, - TagOpen, - TagClose, - Equals, - DoubleQuote, - SingleQuote, - Identifier, - Whitespace, - TextContent, -} from './lexer'; - -import { listComponentAliases } from '../base'; -import * as Nodes from './nodes'; - -// Context for parsing configuration -export interface PomlContext { - variables: { [key: string]: any }; - stylesheet: { [key: string]: string }; - minPomlVersion?: string; - maxPomlVersion?: string; - sourcePath: string; - enabledComponents: Set; - unknownComponentBehavior: 'error' | 'warning' | 'ignore'; -} - -// CST Parser class -export class CSTParser { - private tokens: IToken[]; - private position: number; - private text: string; - private context: PomlContext; - private nodeIdCounter: number; - - // These are the tags that are always valid in POML. - // You can not disable them. - private alwaysValidTags = new Set(['text', 'meta']); - - // These semantics are handled right here. - private nonComponentTags = new Set([ - 'let', - 'include', - 'template', - 'context', - 'stylesheet', - 'output-schema', - 'outputschema', - 'tool', - 'tool-def', - 'tool-definition', - 'tooldef', - 'tooldefinition', - ]); - - constructor(context: PomlContext) { - this.tokens = []; - this.position = 0; - this.text = ''; - this.context = context; - this.nodeIdCounter = 0; - - // Initialize default enabled components (can be extended/disabled via meta tags) - this.context.enabledComponents = new Set(listComponentAliases()); - this.context.unknownComponentBehavior = 'warning'; - } - - private generateId(): string { - return `node_${++this.nodeIdCounter}`; - } - - private currentToken(): IToken | undefined { - return this.tokens[this.position]; - } - - private peekToken(offset: number = 1): IToken | undefined { - return this.tokens[this.position + offset]; +export class PomlCstParser extends CstParser { + // Define rules as public methods + public document!: () => DocumentCstNode; + public content!: () => ContentCstNode; + public element!: () => ElementCstNode; + public literalElement!: () => LiteralElementCstNode; + public selfCloseElement!: () => SelfCloseElementCstNode; + public openTag!: () => OpenTagCstNode; + public closeTag!: () => CloseTagCstNode; + public attributes!: () => AttributesCstNode; + public attribute!: () => AttributeCstNode; + public attributeValue!: () => AttributeValueCstNode; + public quotedValue!: () => QuotedValueCstNode; + public unquotedValue!: () => UnquotedValueCstNode; + public valueContent!: () => ValueContentCstNode; + public escapedChar!: () => EscapedCharCstNode; + public forIterator!: () => ForIteratorCstNode; + public template!: () => TemplateCstNode; + public value!: () => ValueCstNode; + public valueElement!: () => ValueElementCstNode; + public comment!: () => CommentCstNode; + public pragma!: () => PragmaCstNode; + + constructor() { + super(allTokens, { + recoveryEnabled: true, + nodeLocationTracking: 'full', + }); + + this.performSelfAnalysis(); } - private consumeToken(): IToken | undefined { - if (this.position < this.tokens.length) { - return this.tokens[this.position++]; - } - return undefined; - } - - private skipWhitespace(): void { - while (this.currentToken()?.tokenType === Whitespace) { - this.position++; - } - } - - public parse(text: string): ASTNode { - this.text = text; - const lexResult = extendedPomlLexer.tokenize(text); - this.tokens = lexResult.tokens; - this.position = 0; - - const rootNode: ASTNode = { - id: this.generateId(), - kind: 'TEXT', - start: 0, - end: text.length, - content: text, - children: [], - textSegments: [], - }; - - this.parseDocument(rootNode); - return rootNode; - } - - private parseDocument(rootNode: ASTNode): void { - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (!token) { - break; - } - - if (token.tokenType === TagOpen) { - const nextToken = this.peekToken(); - if (nextToken?.tokenType === Identifier) { - const tagName = nextToken.image; - - if (tagName === 'meta') { - const metaNode = this.parseMetaTag(); - if (metaNode) { - rootNode.children.push(metaNode); - metaNode.parent = rootNode; - this.processMeta(metaNode); - } - } else if (this.context.enabledComponents.has(tagName)) { - const pomlNode = this.parsePomlElement(); - if (pomlNode) { - rootNode.children.push(pomlNode); - pomlNode.parent = rootNode; - } - } else { - // Unknown tag - treat as text - this.handleUnknownTag(tagName); - const textNode = this.parseTextContent(); - if (textNode) { - rootNode.children.push(textNode); - textNode.parent = rootNode; - } - } - } else { - // Malformed tag - treat as text - const textNode = this.parseTextContent(); - if (textNode) { - rootNode.children.push(textNode); - textNode.parent = rootNode; - } - } - } else { - const textNode = this.parseTextContent(); - if (textNode) { - rootNode.children.push(textNode); - textNode.parent = rootNode; - } - } - } - } - - private parseMetaTag(): ASTNode | null { - const startPos = this.position; - const openTagStart = this.currentToken()?.startOffset || 0; - - this.consumeToken(); // consume '<' - this.skipWhitespace(); - - const nameToken = this.consumeToken(); // consume 'meta' - if (!nameToken || nameToken.image !== 'meta') { - return null; - } - - const nameRange: SourceRange = { - start: nameToken.startOffset || 0, - end: (nameToken.endOffset || 0) + 1, - }; - - this.skipWhitespace(); - - const attributes = this.parseAttributes(); - - this.skipWhitespace(); - - // Check for self-closing or regular closing - const closeToken = this.currentToken(); - let openTagEnd = 0; - let hasContent = false; - - if (closeToken?.tokenType === TagSelfClose) { - this.consumeToken(); // consume '/>' - openTagEnd = (closeToken.endOffset || 0) + 1; - } else if (closeToken?.tokenType === TagClose) { - this.consumeToken(); // consume '>' - openTagEnd = (closeToken.endOffset || 0) + 1; - hasContent = true; - } - - const metaNode: ASTNode = { - id: this.generateId(), - kind: 'META', - start: openTagStart, - end: openTagEnd, // Will be updated if there's content - content: '', - children: [], - tagName: 'meta', - attributes, - openingTag: { - start: openTagStart, - end: openTagEnd, - nameRange, + // Document is the root rule + private documentRule = this.RULE('document', () => { + this.MANY(() => { + this.OR([{ ALT: () => this.CONSUME(Whitespace) }, { ALT: () => this.SUBRULE(this.content) }]); + }); + }); + + // Content can be elements, comments, pragmas, or values + private contentRule = this.RULE('content', () => { + this.OR([ + { ALT: () => this.SUBRULE(this.pragma) }, + { ALT: () => this.SUBRULE(this.comment) }, + { ALT: () => this.SUBRULE(this.element) }, + { ALT: () => this.SUBRULE(this.literalElement) }, + { ALT: () => this.SUBRULE(this.selfCloseElement) }, + { ALT: () => this.SUBRULE(this.value) }, + ]); + }); + + // Regular element with open/close tags + private elementRule = this.RULE('element', () => { + const openTag = this.SUBRULE(this.openTag); + this.MANY(() => { + this.OR([{ ALT: () => this.CONSUME(Whitespace) }, { ALT: () => this.SUBRULE(this.content) }]); + }); + this.SUBRULE(this.closeTag); + }); + + // Literal element (like ) that preserves content + private literalElementRule = this.RULE('literalElement', () => { + this.SUBRULE(this.openTag); + // Consume everything until matching close tag + this.MANY(() => { + this.OR([ + // Look ahead for closing tag + { + GATE: () => !this.isClosingTag(), + ALT: () => this.consumeAny(), + }, + ]); + }); + this.SUBRULE(this.closeTag); + }); + + // Self-closing element + private selfCloseElementRule = this.RULE('selfCloseElement', () => { + this.CONSUME(TagOpen); + this.CONSUME(Identifier, { LABEL: 'tagName' }); + this.OPTION(() => { + this.CONSUME(Whitespace); + this.OPTION2(() => this.SUBRULE(this.attributes)); + }); + this.CONSUME(TagSelfClose); + }); + + // Opening tag + private openTagRule = this.RULE('openTag', () => { + this.CONSUME(TagOpen); + this.CONSUME(Identifier, { LABEL: 'tagName' }); + this.OPTION(() => { + this.CONSUME(Whitespace); + this.OPTION2(() => this.SUBRULE(this.attributes)); + }); + this.CONSUME(TagClose); + }); + + // Closing tag + private closeTagRule = this.RULE('closeTag', () => { + this.CONSUME(TagClosingOpen); + this.CONSUME(Identifier, { LABEL: 'tagName' }); + this.OPTION(() => this.CONSUME(Whitespace)); + this.CONSUME(TagClose); + }); + + // Attributes + private attributesRule = this.RULE('attributes', () => { + this.MANY_SEP({ + SEP: Whitespace, + DEF: () => this.SUBRULE(this.attribute), + }); + }); + + // Single attribute + private attributeRule = this.RULE('attribute', () => { + this.CONSUME(Identifier, { LABEL: 'key' }); + this.CONSUME(Equals); + this.SUBRULE(this.attributeValue); + }); + + // Attribute value (quoted, unquoted, or for iterator) + private attributeValueRule = this.RULE('attributeValue', () => { + this.OR([ + { ALT: () => this.SUBRULE(this.quotedValue) }, + { ALT: () => this.SUBRULE(this.unquotedValue) }, + // Special case for for="item in items" + { + GATE: () => this.isForAttribute(), + ALT: () => this.SUBRULE(this.forIterator), }, - }; - - if (hasContent) { - // Parse content until closing tag - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (token?.tokenType === TagClosingOpen) { - const nextToken = this.peekToken(); - if (nextToken?.tokenType === Identifier && nextToken.image === 'meta') { - break; - } - } - this.position++; - } - - // Parse closing tag - if (this.currentToken()?.tokenType === TagClosingOpen) { - const closingTagStart = this.currentToken()?.startOffset || 0; - this.consumeToken(); // consume '' - - if (closingNameToken && finalClose) { - metaNode.closingTag = { - start: closingTagStart, - end: (finalClose.endOffset || 0) + 1, - nameRange: { - start: closingNameToken.startOffset || 0, - end: (closingNameToken.endOffset || 0) + 1, - }, - }; - metaNode.end = (finalClose.endOffset || 0) + 1; - } - } - } - - metaNode.content = this.text.slice(metaNode.start, metaNode.end); - return metaNode; - } - - private parsePomlElement(): ASTNode | null { - const openTagStart = this.currentToken()?.startOffset || 0; - - this.consumeToken(); // consume '<' - this.skipWhitespace(); - - const nameToken = this.consumeToken(); - if (!nameToken) { - return null; - } - - const tagName = nameToken.image; - const nameRange: SourceRange = { - start: nameToken.startOffset || 0, - end: (nameToken.endOffset || 0) + 1, - }; - - this.skipWhitespace(); - - const attributes = this.parseAttributes(); - - this.skipWhitespace(); - - // Check for self-closing or regular closing - const closeToken = this.currentToken(); - let openTagEnd = 0; - let hasContent = false; - - if (closeToken?.tokenType === TagSelfClose) { - this.consumeToken(); // consume '/>' - openTagEnd = (closeToken.endOffset || 0) + 1; - } else if (closeToken?.tokenType === TagClose) { - this.consumeToken(); // consume '>' - openTagEnd = (closeToken.endOffset || 0) + 1; - hasContent = true; - } - - const pomlNode: ASTNode = { - id: this.generateId(), - kind: 'POML', - start: openTagStart, - end: openTagEnd, // Will be updated if there's content - content: '', - children: [], - tagName, - attributes, - openingTag: { - start: openTagStart, - end: openTagEnd, - nameRange, + ]); + }); + + // Quoted value + private quotedValueRule = this.RULE('quotedValue', () => { + this.OR([ + { + ALT: () => { + this.CONSUME(DoubleQuote, { LABEL: 'openQuote' }); + this.MANY(() => { + this.SUBRULE(this.valueContent); + }); + this.CONSUME2(DoubleQuote, { LABEL: 'closeQuote' }); + }, }, - }; - - if (hasContent) { - if (tagName === 'text') { - // Special handling for tags - parse content as pure text - this.parseTextContentForTextTag(pomlNode); - } else { - // Parse mixed content (POML and text) - this.parseMixedContent(pomlNode); - } - - // Parse closing tag - if (this.currentToken()?.tokenType === TagClosingOpen) { - const closingTagStart = this.currentToken()?.startOffset || 0; - this.consumeToken(); // consume '' - - if (closingNameToken && finalClose) { - pomlNode.closingTag = { - start: closingTagStart, - end: (finalClose.endOffset || 0) + 1, - nameRange: { - start: closingNameToken.startOffset || 0, - end: (closingNameToken.endOffset || 0) + 1, - }, - }; - pomlNode.end = (finalClose.endOffset || 0) + 1; - } - } - } - - pomlNode.content = this.text.slice(pomlNode.start, pomlNode.end); - return pomlNode; - } - - private parseTextContentForTextTag(parentNode: ASTNode): void { - // In tags, we parse content as pure text but still need to handle nested POML - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (!token) { - break; - } - - if (token.tokenType === TagClosingOpen) { - const nextToken = this.peekToken(); - if (nextToken?.tokenType === Identifier && nextToken.image === parentNode.tagName) { - break; // Found closing tag - } - } - - if (token.tokenType === TagOpen) { - const nextToken = this.peekToken(); - if (nextToken?.tokenType === Identifier && this.context.enabledComponents.has(nextToken.image)) { - // Found nested POML element - const nestedNode = this.parsePomlElement(); - if (nestedNode) { - parentNode.children.push(nestedNode); - nestedNode.parent = parentNode; - } - } else { - // Treat as text - const textNode = this.parseTextContent(); - if (textNode) { - parentNode.children.push(textNode); - textNode.parent = parentNode; - } - } - } else { - const textNode = this.parseTextContent(); - if (textNode) { - parentNode.children.push(textNode); - textNode.parent = parentNode; - } - } - } - } - - private parseMixedContent(parentNode: ASTNode): void { - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (!token) { - break; - } - - if (token.tokenType === TagClosingOpen) { - const nextToken = this.peekToken(); - if (nextToken?.tokenType === Identifier && nextToken.image === parentNode.tagName) { - break; // Found closing tag - } - } - - if (token.tokenType === TagOpen) { - const nextToken = this.peekToken(); - if (nextToken?.tokenType === Identifier && this.context.enabledComponents.has(nextToken.image)) { - // Found nested POML element - const nestedNode = this.parsePomlElement(); - if (nestedNode) { - parentNode.children.push(nestedNode); - nestedNode.parent = parentNode; - } - } else { - // Unknown tag or malformed - treat as text - const textNode = this.parseTextContent(); - if (textNode) { - parentNode.children.push(textNode); - textNode.parent = parentNode; - } - } - } else if (token.tokenType === TemplateOpen) { - // Parse template expression - const templateNode = this.parseTemplate(); - if (templateNode) { - parentNode.children.push(templateNode); - templateNode.parent = parentNode; - } - } else { - const textNode = this.parseTextContent(); - if (textNode) { - parentNode.children.push(textNode); - textNode.parent = parentNode; - } - } - } - } - - private parseTextContent(): ASTNode | null { - const startOffset = this.currentToken()?.startOffset || 0; - let endOffset = startOffset; - - // Collect consecutive text tokens - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (!token) { - break; - } - - if (token.tokenType === TextContent || token.tokenType === Whitespace) { - endOffset = (token.endOffset || 0) + 1; - this.position++; - } else if ( - token.tokenType === TagOpen || - token.tokenType === TemplateOpen || - token.tokenType === TagClosingOpen - ) { - break; - } else { - // Other tokens treated as text in this context - endOffset = (token.endOffset || 0) + 1; - this.position++; - } - } - - if (endOffset === startOffset) { - return null; - } - - const textNode: ASTNode = { - id: this.generateId(), - kind: 'TEXT', - start: startOffset, - end: endOffset, - content: this.text.slice(startOffset, endOffset), - children: [], - textSegments: [{ start: startOffset, end: endOffset }], - }; - - return textNode; - } - - private parseTemplate(): ASTNode | null { - const startToken = this.currentToken(); - if (!startToken || startToken.tokenType !== TemplateOpen) { - return null; - } - - const startOffset = startToken.startOffset || 0; - this.consumeToken(); // consume '{{' - - let expression = ''; - let endOffset = startOffset + 2; - - // Collect content until TemplateClose - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (!token) { - break; - } - - if (token.tokenType === TemplateClose) { - endOffset = (token.endOffset || 0) + 1; - this.consumeToken(); - break; - } else { - expression += token.image; - endOffset = (token.endOffset || 0) + 1; - this.consumeToken(); - } - } - - const templateNode: ASTNode = { - id: this.generateId(), - kind: 'TEMPLATE', - start: startOffset, - end: endOffset, - content: this.text.slice(startOffset, endOffset), - children: [], - expression: expression.trim(), - }; - - return templateNode; - } - - private parseAttributes(): AttributeInfo[] { - const attributes: AttributeInfo[] = []; - - while (this.position < this.tokens.length) { - this.skipWhitespace(); - - const token = this.currentToken(); - if (!token || token.tokenType !== Identifier) { - break; - } - - const keyToken = this.consumeToken()!; - const keyRange: SourceRange = { - start: keyToken.startOffset || 0, - end: (keyToken.endOffset || 0) + 1, - }; - - this.skipWhitespace(); - - if (this.currentToken()?.tokenType !== Equals) { - // Boolean attribute - attributes.push({ - key: keyToken.image, - value: [ - { - id: this.generateId(), - kind: 'TEXT', - start: keyRange.start, - end: keyRange.end, - content: 'true', - children: [], - }, - ], - keyRange, - valueRange: keyRange, - fullRange: keyRange, - }); - continue; - } - - this.consumeToken(); // consume '=' - this.skipWhitespace(); - - const quoteToken = this.currentToken(); - if (!quoteToken || (quoteToken.tokenType !== DoubleQuote && quoteToken.tokenType !== SingleQuote)) { - break; // Invalid attribute - } - - const isDoubleQuote = quoteToken.tokenType === DoubleQuote; - const valueStart = (quoteToken.endOffset || 0) + 1; - this.consumeToken(); // consume opening quote - - const valueNodes: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[] = []; - let valueEnd = valueStart; - - // Parse attribute value content - while (this.position < this.tokens.length) { - const token = this.currentToken(); - if (!token) { - break; - } - - if ((isDoubleQuote && token.tokenType === DoubleQuote) || (!isDoubleQuote && token.tokenType === SingleQuote)) { - valueEnd = token.startOffset || valueEnd; - this.consumeToken(); // consume closing quote - break; - } else if (token.tokenType === TemplateOpen) { - const templateNode = this.parseTemplate(); - if (templateNode && (templateNode.kind === 'TEXT' || templateNode.kind === 'TEMPLATE')) { - valueNodes.push(templateNode as ASTNode & { kind: 'TEXT' | 'TEMPLATE' }); - } - } else { - // Collect text content - const textStart = token.startOffset || 0; - let textEnd = (token.endOffset || 0) + 1; - let textContent = token.image; - - this.consumeToken(); - - // Collect more text tokens - while (this.position < this.tokens.length) { - const nextToken = this.currentToken(); - if (!nextToken) { - break; - } - - if ( - (isDoubleQuote && nextToken.tokenType === DoubleQuote) || - (!isDoubleQuote && nextToken.tokenType === SingleQuote) || - nextToken.tokenType === TemplateOpen - ) { - break; - } - - textContent += nextToken.image; - textEnd = (nextToken.endOffset || 0) + 1; - this.consumeToken(); - } - - valueNodes.push({ - id: this.generateId(), - kind: 'TEXT', - start: textStart, - end: textEnd, - content: textContent, - children: [], + { + ALT: () => { + this.CONSUME(SingleQuote, { LABEL: 'openQuote' }); + this.MANY2(() => { + this.SUBRULE2(this.valueContent); }); - } - } - - const valueRange: SourceRange = { start: valueStart, end: valueEnd }; - const fullRange: SourceRange = { - start: keyRange.start, - end: (this.tokens[this.position - 1]?.endOffset || 0) + 1, - }; - - attributes.push({ - key: keyToken.image, - value: valueNodes, - keyRange, - valueRange, - fullRange, - }); - } - - return attributes; + this.CONSUME2(SingleQuote, { LABEL: 'closeQuote' }); + }, + }, + ]); + }); + + // Unquoted value (template or expression) + private unquotedValueRule = this.RULE('unquotedValue', () => { + this.OR([ + { ALT: () => this.SUBRULE(this.template) }, + { ALT: () => this.CONSUME(Identifier, { LABEL: 'expression' }) }, + { ALT: () => this.CONSUME(TextContent, { LABEL: 'expression' }) }, + ]); + }); + + // Value content inside quotes + private valueContentRule = this.RULE('valueContent', () => { + this.OR([ + { ALT: () => this.SUBRULE(this.template) }, + { ALT: () => this.SUBRULE(this.escapedChar) }, + { ALT: () => this.CONSUME(TextContent, { LABEL: 'text' }) }, + { ALT: () => this.CONSUME(Identifier, { LABEL: 'text' }) }, + { ALT: () => this.CONSUME(Whitespace, { LABEL: 'text' }) }, + ]); + }); + + // Escaped character + private escapedCharRule = this.RULE('escapedChar', () => { + this.CONSUME(Backslash); + this.OR([ + { ALT: () => this.CONSUME(DoubleQuote, { LABEL: 'char' }) }, + { ALT: () => this.CONSUME(SingleQuote, { LABEL: 'char' }) }, + { ALT: () => this.CONSUME(Backslash, { LABEL: 'char' }) }, + { ALT: () => this.CONSUME(Identifier, { LABEL: 'char' }) }, + ]); + }); + + // For iterator (item in items) + private forIteratorRule = this.RULE('forIterator', () => { + this.CONSUME(Identifier, { LABEL: 'iterator' }); + this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'Whitespace1' })); + this.CONSUME2(Identifier, { LABEL: 'in' }); // "in" keyword + this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'Whitespace2' })); + // Collection can be complex expression + this.AT_LEAST_ONE(() => { + this.OR([ + { ALT: () => this.CONSUME3(Identifier, { LABEL: 'collection' }) }, + { ALT: () => this.CONSUME(TextContent, { LABEL: 'collection' }) }, + ]); + }); + }); + + // Template {{ expression }} + private templateRule = this.RULE('template', () => { + this.CONSUME(TemplateOpen); + this.MANY(() => { + this.OR([ + { ALT: () => this.CONSUME(Whitespace, { LABEL: 'expression' }) }, + { ALT: () => this.CONSUME(Identifier, { LABEL: 'expression' }) }, + { ALT: () => this.CONSUME(TextContent, { LABEL: 'expression' }) }, + ]); + }); + this.CONSUME(TemplateClose); + }); + + // Value (text and/or templates) + private valueRule = this.RULE('value', () => { + this.AT_LEAST_ONE(() => { + this.SUBRULE(this.valueElement); + }); + }); + + // Value element (text or template) + private valueElementRule = this.RULE('valueElement', () => { + this.OR([ + { ALT: () => this.SUBRULE(this.template) }, + { ALT: () => this.CONSUME(TextContent, { LABEL: 'text' }) }, + { ALT: () => this.CONSUME(Identifier, { LABEL: 'text' }) }, + { ALT: () => this.CONSUME(Whitespace, { LABEL: 'text' }) }, + ]); + }); + + // Comment + private commentRule = this.RULE('comment', () => { + this.CONSUME(CommentOpen); + this.MANY(() => { + this.OR([ + { + GATE: () => !this.isCommentClose(), + ALT: () => this.consumeAny({ LABEL: 'commentContent' }), + }, + ]); + }); + this.CONSUME(CommentClose); + }); + + // Pragma + private pragmaRule = this.RULE('pragma', () => { + this.CONSUME(CommentOpen); + this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'Whitespace1' })); + this.CONSUME(Pragma); + this.MANY(() => { + this.OR([ + { + GATE: () => !this.isCommentClose(), + ALT: () => this.consumeAny({ LABEL: 'pragmaContent' }), + }, + ]); + }); + this.CONSUME(CommentClose); + }); + + // Helper methods + private isClosingTag(): boolean { + return this.LA(1).tokenType === TagClosingOpen; } - private processMeta(metaNode: ASTNode): void { - if (!metaNode.attributes) { - return; - } - - for (const attr of metaNode.attributes) { - switch (attr.key) { - case 'components': - this.processComponentsAttribute(attr.value); - break; - case 'unknownComponents': - const behavior = attr.value[0]?.content; // eslint-disable-line - if (behavior === 'error' || behavior === 'warning' || behavior === 'ignore') { - this.context.unknownComponentBehavior = behavior; - } - break; - case 'minimalPomlVersion': - this.context.minimalPomlVersion = attr.value[0]?.content; - break; - // Add other meta attributes as needed - } - } + private isCommentClose(): boolean { + return this.LA(1).tokenType === CommentClose; } - private processComponentsAttribute(value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]): void { - const components = value[0]?.content || ''; - const parts = components.split(',').map((s) => s.trim()); - - for (const part of parts) { - if (part.startsWith('+')) { - this.context.enabledComponents.add(part.slice(1)); - } else if (part.startsWith('-')) { - this.context.enabledComponents.delete(part.slice(1)); - } - } + private isForAttribute(): boolean { + // Check if previous token was "for" as attribute key + const prevTokens = this.input.slice(Math.max(0, this.currIdx - 3), this.currIdx); + return prevTokens.some((t) => t.image.toLowerCase() === 'for'); } - private handleUnknownTag(tagName: string): void { - switch (this.context.unknownComponentBehavior) { - case 'error': - throw new Error(`Unknown POML component: ${tagName}`); - case 'warning': - console.warn(`Unknown POML component: ${tagName}`); - break; - case 'ignore': - // Do nothing - break; - } + private consumeAny(options?: { LABEL?: string }): IToken { + // Consume any token + const token = this.LA(1); + this.input[this.currIdx++]; + return token; } } - -// Export function to create and use the parser -export function parseExtendedPoml(text: string, context: Partial = {}): ASTNode { - const fullContext: PomlContext = { - variables: {}, - stylesheet: {}, - sourcePath: '', - enabledComponents: new Set(), - unknownComponentBehavior: 'warning', - ...context, - }; - - const parser = new CSTParser(fullContext); - return parser.parse(text); -} diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 0d682204..567bdc9d 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -1,4 +1,5 @@ import { Range } from './types'; +import { IToken } from 'chevrotain'; /** * Represents a JavaScript expression as a string. @@ -24,6 +25,8 @@ export interface ExpressionNode { value: string; } +export interface ExpressionCstNode {} + /** * Represents a template interpolation with double curly braces, * or sometimes without braces in specific attributes. @@ -180,7 +183,7 @@ export interface ForLoopAttributeNode { kind: 'FORATTRIBUTE'; range: Range; key: StringNode; - value: ForLoopNode; + value: ForIteratorNode; } /** @@ -247,7 +250,6 @@ export interface CloseTagNode { * - Elements with content: `
    content
    ` (use ElementNode) * - Separate open/close tags: `
    ` (use ElementNode) * - Tags without the self-closing slash: `` (use OpenTagNode) - * - Meta elements: `` tags (use MetaNode) */ export interface SelfCloseElementNode { kind: 'SELFCLOSE'; @@ -283,6 +285,41 @@ export interface ElementNode { children: (ElementNode | LiteralNode | CommentNode | PragmaNode | ValueNode)[]; } +/** + * Represents an HTML-like line/block comment in POML. + * + * Comment nodes preserve authoring notes or disabled content that should not + * affect rendering. The `value` holds the comment text without the `` + * delimiters. + * + * Examples: + * - `` + */ +export interface CommentNode { + kind: 'COMMENT'; + range: Range; + value: StringNode; +} + +/** + * Represents a pragma directive carried inside a comment. + * + * Pragmas are special instructions for parser/compiler. They usually appear + * inside comments and start with `@pragma`. For now we keep this node simple + * with a single `value` that contains the full directive text after + * `@pragma` (e.g. `components +reference -table`). + * + * Examples: + * - Specify version: `` + * - Turn tags on/off: `` + * - Turn speaker roles on/off: `` or `single` + */ +export interface PragmaNode { + kind: 'PRAGMA'; + range: Range; + value: StringNode; +} + /** * Represents an element that preserves literal content. * @@ -314,25 +351,6 @@ export interface LiteralNode { children: StringNode; } -/** - * Represents metadata elements in POML. Meta elements must be self-closed. - * - * Meta nodes provide document-level metadata and configuration that doesn't - * render as visible content. They typically appear at the document start and - * configure processing behavior, document properties, or provide auxiliary - * information. - * - * Cases that apply: - * - Document metadata: `` - * - Configuration: `` - */ -export interface MetaNode { - kind: 'META'; - range: Range; - value: StringNode; - attributes: AttributeNode[]; -} - /** * Represents the root node of a POML document tree. * @@ -351,7 +369,7 @@ export interface MetaNode { export interface RootNode { kind: 'ROOT'; range: Range; - children: (ElementNode | LiteralNode | MetaNode | ValueNode)[]; + children: (ElementNode | LiteralNode | CommentNode | PragmaNode | ValueNode)[]; } // Keep these keys required; everything else becomes recursively optional @@ -386,7 +404,8 @@ export type StrictNode = | SelfCloseElementNode | ElementNode | LiteralNode - | MetaNode + | CommentNode + | PragmaNode | RootNode; // The "loose" counterpart you can safely produce during parsing. From 7e2d3f0e9d226cb056f9c14c71cc72654a60a293 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 18:44:49 +0800 Subject: [PATCH 28/76] . --- packages/poml/next/lexer.ts | 48 ++++++++++------- packages/poml/next/nodes.ts | 47 +++++++++++++++-- packages/poml/tests/reader/lexer.test.ts | 66 +++++++++++++++++------- 3 files changed, 122 insertions(+), 39 deletions(-) diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts index 9d58174b..f712ef93 100644 --- a/packages/poml/next/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -1,8 +1,8 @@ import { createToken, Lexer } from 'chevrotain'; // Define token types for extended POML -export const CommentOpen = createToken({ name: 'CommentOpen', pattern: // }); +export const CommentOpen = createToken({ name: 'CommentOpen', pattern: // }); export const Pragma = createToken({ name: 'Pragma', pattern: /\b@pragma\b/i }); export const TemplateOpen = createToken({ name: 'TemplateOpen', pattern: /{{/ }); export const TemplateClose = createToken({ name: 'TemplateClose', pattern: /}}/ }); @@ -20,16 +20,17 @@ export const Backslash = createToken({ name: 'Backslash', pattern: /\\/ }); /* Identifier is one of the following: * - XML tag names * - XML attribute names - * - TextContent incorrectly parsed as identifiers + * - Arbitrary text content incorrectly parsed as identifiers * * Notes: * 1. In case 1, tags can contain : (namespaces) and . (extensions). * These are handled later by CST parser. * 2. In case 3, CST parser will reclassify as TextContent if needed. + * 3. We are going to disallow "." and ":" to appear in XML tags. */ export const Identifier = createToken({ name: 'Identifier', - pattern: /[a-zA-Z_][a-zA-Z0-9_\-]*/, + pattern: /[a-zA-Z_]([a-zA-Z0-9_]|(-(?!\-+>)))*/, }); export const Whitespace = createToken({ @@ -38,24 +39,35 @@ export const Whitespace = createToken({ line_breaks: true, }); - -/* Catch-all for arbitrary text content - - Match any char except: - < — starts a tag - {{ or }} — template delimiters - " or ' — start/end of string literals - - Single { or } are OK because they are not followed by another brace -*/ -export const TextContent = createToken({ - name: 'TextContent', - pattern: /(?:[^<"'{}]|{(?!{)|}(?!}))+/, +/* Catch-all for arbitrary text content. + * Match any char except the patterns from other tokens: + * - starts or ends a tag: <, >, + * - starts or ends a comment: + * - starts or ends a template: {{, }} + * - starts or ends a string literal: " or ' + * - whitespace (handled separately) + * - equal sign (=) + * - backslash \ (allowed for escaping in strings) + * + * Allowed: + * - Single { or } are OK if they are not followed by another brace + * - Incomplete tag delimiters such as / (/< is an exception, because < is a start of tag) + * - Incomplete comment delimiters such as !-- or -- are OK + * - Incorrect @pragma directive such as @pragm or @pragmaX will be matched + */ +export const Arbitrary = createToken({ + name: 'Arbitrary', + // Anything except: <, >, quotes, braces (double-brace protected), whitespace, =, backslash. + // Also allow a lone '/' but *not* when it starts '/>' (so TagSelfClose can win). + pattern: /(?:[^<>"'{}\s=\\\/-]|{(?!{)|}(?!})|\/(?!>)|\-(?!\-+>))+/, line_breaks: true, }); - // Define token order - more specific patterns first export const allTokens = [ - Comment, + CommentOpen, + CommentClose, + Pragma, TemplateOpen, TemplateClose, TagClosingOpen, // Must come before TagOpen @@ -68,7 +80,7 @@ export const allTokens = [ Backslash, Identifier, Whitespace, - TextContent, + Arbitrary, ]; // Extended POML Lexer class diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 567bdc9d..bd9c6ecd 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -1,5 +1,5 @@ import { Range } from './types'; -import { IToken } from 'chevrotain'; +import { CstNode, IToken } from 'chevrotain'; /** * Represents a JavaScript expression as a string. @@ -10,9 +10,7 @@ import { IToken } from 'chevrotain'; * * Cases that apply: * - Conditional expressions: `i > 0`, `user.name === "admin"` - * - Collection accessors: `items.everything`, `data[0].value` * - Function calls: `formatDate(now)`, `items.filter(x => x.active)` - * - Property paths: `user.profile.settings.theme` * * Cases that do not apply: * - Template syntax including braces: `{{ expression }}` (use TemplateNode) @@ -301,6 +299,8 @@ export interface CommentNode { value: StringNode; } +// export interface Comment + /** * Represents a pragma directive carried inside a comment. * @@ -320,6 +320,14 @@ export interface PragmaNode { value: StringNode; } +export interface PragmaCstNode extends CstNode { + children: { + CommentOpenTag?: IToken[]; + CommentCloseTag?: IToken[]; + PragmaSymbol?: IToken[]; + }; +} + /** * Represents an element that preserves literal content. * @@ -351,6 +359,19 @@ export interface LiteralNode { children: StringNode; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface LiteralElementCstNode extends CstNode { + children: { + OpenTag?: OpenTagCstNode[]; + CloseTag?: CloseTagCstNode[]; + // All content between open and close tags is treated as literal text + // including other tags, comments, pragmas, etc. + TextContent?: IToken[]; + }; +} + /** * Represents the root node of a POML document tree. * @@ -372,6 +393,26 @@ export interface RootNode { children: (ElementNode | LiteralNode | CommentNode | PragmaNode | ValueNode)[]; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface RootCstNode extends CstNode { + children: { + Content?: ElementContentCstNode[]; + }; +} + +export interface ElementContentCstNode extends CstNode { + children: { + Element?: ElementCstNode; + LiteralElement?: LiteralElementCstNode; + SelfCloseElement?: SelfCloseElementCstNode; + Comment?: CommentCstNode; + Pragma?: PragmaCstNode; + Value?: ElementValueCstNode; + }; +} + // Keep these keys required; everything else becomes recursively optional type DeepPartialExcept = // arrays diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index 432c06ee..1c73bde0 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -1,7 +1,8 @@ import { describe, expect, test } from '@jest/globals'; import { extendedPomlLexer, - Comment, + CommentOpen, + CommentClose, TemplateOpen, TemplateClose, TagOpen, @@ -14,7 +15,7 @@ import { Backslash, Identifier, Whitespace, - TextContent, + Arbitrary, } from 'poml/next/lexer'; // Helper function to extract token images @@ -262,12 +263,37 @@ describe('Token Types', () => { }); test('should identify comments', () => { - expect(tokenTypes('')).toEqual([Comment]); + expect(tokenTypes('')).toEqual([CommentOpen, Whitespace, Identifier, Whitespace, CommentClose]); }); test('should identify whitespace', () => { expect(tokenTypes(' \t\n ')).toEqual([Whitespace]); }); + + test('should identify attributes', () => { + expect(tokenTypes('')).toEqual([ + TagOpen, + Identifier, + Arbitrary, + Whitespace, + Identifier, + Equals, + DoubleQuote, + Identifier, + DoubleQuote, + Whitespace, + Identifier, + Equals, + DoubleQuote, + Arbitrary, + Backslash, + Identifier, + DoubleQuote, + Arbitrary, + SingleQuote, + TagClose, + ]); + }); }); describe('Source Position and Error Tests', () => { @@ -436,7 +462,7 @@ describe('Boundary Conditions', () => { const longComment = ``; const commentResult = tokenize(longComment); expect(commentResult.errors).toHaveLength(0); - expect(commentResult.tokens).toHaveLength(1); + expect(commentResult.tokens).toHaveLength(3); const longIdentifier = 'a' + 'b'.repeat(10000); const identifierResult = tokenize(longIdentifier); @@ -548,14 +574,14 @@ describe('Malformed Patterns', () => { test('should handle broken template syntax', () => { expect(tokenImages('}')).toEqual(['}']); expect(tokenImages('}}')).toEqual(['}}']); - expect(tokenImages('{ single brace }')).toEqual(['{ single brace }']); - expect(tokenImages('{not a template}')).toEqual(['{not a template}']); + expect(tokenImages('{ single brace }')).toEqual(['{', ' ', 'single', ' ', 'brace', ' ', '}']); + expect(tokenImages('{not a template}')).toEqual(['{not', ' ', 'a', ' ', 'template', '}']); }); test('should handle nested malformed patterns', () => { - expect(tokenImages('')).toEqual(['']); - expect(tokenImages('')).toEqual(['']); - expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('more{{ content')).toEqual([ - '', + expect(tokenImages('more{{ content')).toEqual([ + '', '<', 'tag', '>', @@ -647,13 +677,13 @@ describe('Malformed Patterns', () => { expect(tokenImages('text{more}text')).toEqual(['text', '{more}text']); expect(tokenImages('before}after')).toEqual(['before', '}after']); expect(tokenImages('before{after')).toEqual(['before', '{after']); - expect(tokenImages('text } { more')).toEqual(['text', ' ', '} { more']); + expect(tokenImages('text } { more')).toEqual(['text', ' ', '}', ' ', '{', ' ', 'more']); }); test('should handle greedy vs non-greedy matching', () => { - expect(tokenImages('')).toEqual(['', '']); + expect(tokenImages('')).toEqual(['', '']); expect(tokenImages('{{first}}{{second}}')).toEqual(['{{', 'first', '}}', '{{', 'second', '}}']); - expect(tokenImages('textmore')).toEqual(['text', '', 'more']); + expect(tokenImages('textmore')).toEqual(['text', '', 'more']); }); }); @@ -740,11 +770,11 @@ comment --> more text`; const result = tokenize(input); - const commentToken = result.tokens.find((t) => t.tokenType.name === 'Comment'); + const commentToken = result.tokens.find((t) => t.tokenType.name === 'CommentOpen'); expect(commentToken).toBeDefined(); expect(commentToken!.startLine).toBe(2); - expect(commentToken!.endLine).toBe(4); + expect(commentToken!.endLine).toBe(2); }); test('should handle position tracking with carriage returns', () => { @@ -783,7 +813,7 @@ describe('Performance and Stress Tests', () => { const end = performance.now(); expect(result.errors).toHaveLength(0); - expect(result.tokens).toHaveLength(1); + expect(result.tokens).toHaveLength(3); expect(end - start).toBeLessThan(500); // Should be fast }); From b070c727d4c1c5137e6d74c1ce58fd85b4ca63b4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 18:54:19 +0800 Subject: [PATCH 29/76] . --- packages/poml/next/lexer.ts | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts index f712ef93..4f362a6a 100644 --- a/packages/poml/next/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -33,9 +33,10 @@ export const Identifier = createToken({ pattern: /[a-zA-Z_]([a-zA-Z0-9_]|(-(?!\-+>)))*/, }); +// Include all Unicode whitespace characters and control characters export const Whitespace = createToken({ name: 'Whitespace', - pattern: /[ \t\r\n]+/, + pattern: /[\s\u0000-\u001F\u007F-\u009F\u2000-\u200B\uFEFF]+/, line_breaks: true, }); @@ -45,22 +46,23 @@ export const Whitespace = createToken({ * - starts or ends a comment: * - starts or ends a template: {{, }} * - starts or ends a string literal: " or ' - * - whitespace (handled separately) + * - whitespace (handled separately - includes Unicode whitespace and control chars) * - equal sign (=) - * - backslash \ (allowed for escaping in strings) + * - backslash \ (handled separately for escaping) * * Allowed: * - Single { or } are OK if they are not followed by another brace * - Incomplete tag delimiters such as / (/< is an exception, because < is a start of tag) * - Incomplete comment delimiters such as !-- or -- are OK * - Incorrect @pragma directive such as @pragm or @pragmaX will be matched + * - All other Unicode characters including emojis, CJK, etc. */ export const Arbitrary = createToken({ name: 'Arbitrary', - // Anything except: <, >, quotes, braces (double-brace protected), whitespace, =, backslash. - // Also allow a lone '/' but *not* when it starts '/>' (so TagSelfClose can win). - pattern: /(?:[^<>"'{}\s=\\\/-]|{(?!{)|}(?!})|\/(?!>)|\-(?!\-+>))+/, - line_breaks: true, + // Match anything except: <, >, quotes, =, backslash, whitespace (including Unicode), control chars + // Allow single braces and slashes with lookahead constraints + pattern: /(?:[^<>"'{}=\\\s\u0000-\u001F\u007F-\u009F\u2000-\u200B\uFEFF\/-]|{(?!{)|}(?!})|\/(?!>)|\-(?!\-+>))+/, + line_breaks: false, }); // Define token order - more specific patterns first From f03315de8ef41005a7e90e33f02a43179e67ce2b Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 19:09:53 +0800 Subject: [PATCH 30/76] . --- packages/poml/next/lexer.ts | 2 +- packages/poml/tests/reader/lexer.test.ts | 33 +++++++++++++++--------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts index 4f362a6a..216f6e9e 100644 --- a/packages/poml/next/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -1,7 +1,7 @@ import { createToken, Lexer } from 'chevrotain'; // Define token types for extended POML -export const CommentOpen = createToken({ name: 'CommentOpen', pattern: /))*/ }); export const CommentClose = createToken({ name: 'CommentClose', pattern: /-{2,}>/ }); export const Pragma = createToken({ name: 'Pragma', pattern: /\b@pragma\b/i }); export const TemplateOpen = createToken({ name: 'TemplateOpen', pattern: /{{/ }); diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index 1c73bde0..ff2e940d 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -37,7 +37,7 @@ function tokenize(input: string) { describe('Basic Token Images', () => { test('should tokenize HTML comments', () => { - expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['']); }); test('should tokenize template variables', () => { @@ -114,15 +114,18 @@ describe('Edge Cases', () => { test('chinese characters', () => { expect(tokenImages('中文 {{ 文本 }}内容< 标签>')).toEqual([ - '中文 ', + '中文', + ' ', '{{', ' ', - '文本 ', + '文本', + ' ', '}}', '内容', '<', ' ', - '标签>', + '标签', + '>', ]); }); @@ -376,7 +379,9 @@ Analyze data {{variable}}`; const images = tokenImages(input); - expect(images).toContain('# My Analysis\n\n'); + expect(images).toContain('#'); + expect(images).toContain('My'); + expect(images).toContain('Analysis'); expect(images).toContain('<'); expect(images).toContain('task'); expect(images).toContain('>'); @@ -387,7 +392,11 @@ Analyze data test('should handle comments with mixed content', () => { expect(tokenImages('content')).toEqual([ - '', + '', '<', 'task', '>', @@ -446,7 +455,7 @@ describe('Boundary Conditions', () => { }); test('should handle minimum valid patterns', () => { - expect(tokenImages('')).toEqual(['']); + expect(tokenImages('')).toEqual(['']); expect(tokenImages('
    ')).toEqual(['<', 'a', '>']); expect(tokenImages('')).toEqual(['']); expect(tokenImages('')).toEqual(['<', 'a', '/>']); @@ -509,13 +518,13 @@ describe('Unicode and Special Characters', () => { }); test('should handle emoji and symbols', () => { - expect(tokenImages('Hello 👋 World 🌍')).toEqual(['Hello', ' ', '👋 World 🌍']); - expect(tokenImages('Math: ∑∞π≠∅')).toEqual(['Math', ': ∑∞π≠∅']); - expect(tokenImages('Arrows: ←→↑↓')).toEqual(['Arrows', ': ←→↑↓']); + expect(tokenImages('Hello 👋 World 🌍')).toEqual(['Hello', ' ', '👋', ' ', 'World', ' ', '🌍']); + expect(tokenImages('Math: ∑∞π≠∅')).toEqual(['Math', ':', ' ', '∑∞π≠∅']); + expect(tokenImages('Arrows: ←→↑↓')).toEqual(['Arrows', ':', ' ', '←→↑↓']); }); test('should handle unicode', () => { - expect(tokenImages('<こんにちは>')).toEqual(['<', 'こんにちは>']); + expect(tokenImages('<こんにちは>')).toEqual(['<', 'こんにちは', '>']); expect(tokenImages('{{你好}}')).toEqual(['{{', '你好', '}}']); expect(tokenImages('')).toEqual(['<', 'tag', ' ', 'attr', '=', '"', 'caf', 'é', '"', '>']); }); @@ -550,7 +559,7 @@ describe('Malformed Patterns', () => { test('should handle incomplete template variables', () => { expect(tokenImages('text {{')).toEqual(['text', ' ', '{{']); expect(tokenImages('text {{variable')).toEqual(['text', ' ', '{{', 'variable']); - expect(tokenImages('{{ var }{ not closed')).toEqual(['{{', ' ', 'var', ' ', '}{ not closed']); + expect(tokenImages('{{ var }{ not closed')).toEqual(['{{', ' ', 'var', ' ', '}{', ' ', 'not', ' ', 'closed']); expect(tokenImages('{{nested {{inside')).toEqual(['{{', 'nested', ' ', '{{', 'inside']); }); From 675ae8928b8a1740220caba6b5d032a2b8eceabd Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 30 Aug 2025 19:18:10 +0800 Subject: [PATCH 31/76] . --- packages/poml/tests/reader/lexer.test.ts | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index ff2e940d..93c60818 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -694,6 +694,68 @@ describe('Malformed Patterns', () => { expect(tokenImages('{{first}}{{second}}')).toEqual(['{{', 'first', '}}', '{{', 'second', '}}']); expect(tokenImages('textmore')).toEqual(['text', '', 'more']); }); + + test('should handle single braces correctly', () => { + // Single { or } are OK if not followed by another brace + expect(tokenImages('text { more text')).toEqual(['text', ' ', '{', ' ', 'more', ' ', 'text']); + expect(tokenImages('text } more text')).toEqual(['text', ' ', '}', ' ', 'more', ' ', 'text']); + expect(tokenImages('a{b}c')).toEqual(['a', '{b}c']); + expect(tokenImages('path{index}')).toEqual(['path', '{index}']); + expect(tokenImages('array[{key}]')).toEqual(['array', '[{key}]']); + expect(tokenImages('{ not {{ double')).toEqual(['{', ' ', 'not', ' ', '{{', ' ', 'double']); + expect(tokenImages('} not }} double')).toEqual(['}', ' ', 'not', ' ', '}}', ' ', 'double']); + expect(tokenImages('{}empty{}')).toEqual(['{}empty{}']); + expect(tokenImages('}{reversed}{')).toEqual(['}{reversed}{']); + }); + + test('should handle incomplete tag delimiters', () => { + // Incomplete tag delimiters such as / (except /< and />) + expect(tokenImages('path/to/file')).toEqual(['path', '/to/file']); + expect(tokenImages('a/b/c')).toEqual(['a', '/b/c']); + expect(tokenImages('text / more')).toEqual(['text', ' ', '/', ' ', 'more']); + expect(tokenImages('http://example.com')).toEqual(['http', '://example.com']); + expect(tokenImages('5/3=1.67')).toEqual(['5/3', '=', '1.67']); + // These should NOT match as incomplete delimiters + expect(tokenImages('/')).toEqual(['/', '<', 'tag', '>']); + expect(tokenImages('/>')).toEqual(['/>']); + expect(tokenImages('')).toEqual(['']); + }); + + test('should handle incomplete comment delimiters', () => { + // Incomplete comment delimiters such as !-- or -- are OK + expect(tokenImages('text !-- not comment')).toEqual(['text', ' ', '!--', ' ', 'not', ' ', 'comment']); + expect(tokenImages('text -- also not')).toEqual(['text', ' ', '--', ' ', 'also', ' ', 'not']); + expect(tokenImages('a--b')).toEqual(['a--b']); + expect(tokenImages('!--incomplete')).toEqual(['!--incomplete']); + expect(tokenImages('--dashes--')).toEqual(['--dashes--']); + expect(tokenImages('')).toEqual([ + '', + ]); + expect(tokenImages('not')).toEqual(['not', '']); + expect(tokenImages('---triple-dash')).toEqual(['---triple-dash']); + expect(tokenImages('text --- more')).toEqual(['text', ' ', '---', ' ', 'more']); + }); + + test('should handle incorrect @pragma directives', () => { + // Incorrect @pragma directive such as @pragm or @pragmaX will be matched as Arbitrary + expect(tokenImages('@pragma')).toEqual(['@pragma']); + expect(tokenImages('@pragm')).toEqual(['@pragm']); + expect(tokenImages('@pragmaX')).toEqual(['@pragmaX']); + expect(tokenImages('@pragma-extended')).toEqual(['@pragma-extended']); + expect(tokenImages('@@pragma')).toEqual(['@@pragma']); + expect(tokenImages('not@pragma')).toEqual(['not', '@pragma']); + expect(tokenImages('@PRAGMA')).toEqual(['@PRAGMA']); + expect(tokenImages('@Pragma')).toEqual(['@Pragma']); + expect(tokenImages('@pragma key=value')).toEqual(['@pragma', ' ', 'key', '=', 'value']); + }); }); describe('Position Tracking Accuracy', () => { From f02c73570f73f798135d5e2dd9ae425fc3a891f3 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sun, 31 Aug 2025 00:34:12 +0800 Subject: [PATCH 32/76] . --- packages/poml/next/nodes.ts | 119 +++++++++++++++++++---- packages/poml/tests/reader/lexer.test.ts | 7 ++ 2 files changed, 109 insertions(+), 17 deletions(-) diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index bd9c6ecd..389bf2dd 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -184,6 +184,23 @@ export interface ForLoopAttributeNode { value: ForIteratorNode; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface ForLoopAttributeCstNode extends CstNode { + children: { + AttributeKey?: IToken[]; + WsAfterKey?: IToken[]; + Equals?: IToken[]; + WsAfterEquals?: IToken[]; + OpenQuote?: IToken[]; + WsAfterOpenQuote?: IToken[]; + ForIterator?: ForIteratorCstNode[]; + WsBeforeCloseQuote?: IToken[]; + CloseQuote?: IToken[]; + }; +} + /** * Represents an opening tag in POML markup. * @@ -206,10 +223,25 @@ export interface ForLoopAttributeNode { export interface OpenTagNode { kind: 'OPEN'; range: Range; - value: StringNode; + value: StringNode; // tag name attributes: (AttributeNode | ForLoopAttributeNode)[]; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface OpenTagCstNode extends CstNode { + children: { + OpenBracket?: IToken[]; + WsAfterBracket?: IToken[]; + TagName?: IToken[]; + WsAfterName?: IToken[]; + Attribute?: AttributeCstNode[]; + WsAfterAttribute?: IToken[]; + CloseBracket?: IToken[]; + }; +} + /** * Represents a closing tag in POML markup. * @@ -229,7 +261,19 @@ export interface OpenTagNode { export interface CloseTagNode { kind: 'CLOSE'; range: Range; - value: StringNode; + value: StringNode; // tag name +} + +/** + * Related CST node interfaces for parsing stage. + */ +export interface CloseTagCstNode extends CstNode { + children: { + ClosingOpenBracket?: IToken[]; + WsAfterBracket?: IToken[]; + TagName?: IToken[]; + CloseBracket?: IToken[]; + }; } /** @@ -252,10 +296,25 @@ export interface CloseTagNode { export interface SelfCloseElementNode { kind: 'SELFCLOSE'; range: Range; - value: StringNode; + value: StringNode; // tag name attributes: (AttributeNode | ForLoopAttributeNode)[]; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface SelfCloseElementCstNode extends CstNode { + children: { + OpenBracket?: IToken[]; + WsAfterBracket?: IToken[]; + TagName?: IToken[]; + WsAfterName?: IToken[]; + Attribute?: AttributeCstNode[]; + WsAfterAttribute?: IToken[]; + SelfCloseBracket?: IToken[]; + }; +} + /** * Represents a complete POML element with its content. * @@ -283,6 +342,28 @@ export interface ElementNode { children: (ElementNode | LiteralNode | CommentNode | PragmaNode | ValueNode)[]; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface ElementCstNode extends CstNode { + children: { + OpenTag?: OpenTagCstNode[]; + CloseTag?: CloseTagCstNode[]; + Content?: IToken[]; + }; +} + +export interface ElementContentCstNode extends CstNode { + children: { + Element?: ElementCstNode; + LiteralElement?: LiteralElementCstNode; + SelfCloseElement?: SelfCloseElementCstNode; + Comment?: CommentCstNode; + Pragma?: PragmaCstNode; + Value?: ElementValueCstNode; + }; +} + /** * Represents an HTML-like line/block comment in POML. * @@ -299,7 +380,16 @@ export interface CommentNode { value: StringNode; } -// export interface Comment +/** + * Related CST node interfaces for parsing stage. + */ +export interface CommentCstNode extends CstNode { + children: { + CommentOpenTag?: IToken[]; + CommentContent?: IToken[]; + CommentCloseTag?: IToken[]; + }; +} /** * Represents a pragma directive carried inside a comment. @@ -320,11 +410,18 @@ export interface PragmaNode { value: StringNode; } +/** + * Related CST node interfaces for parsing stage. + */ export interface PragmaCstNode extends CstNode { children: { CommentOpenTag?: IToken[]; + WsAfterOpen?: IToken[]; + PragmaKeyword?: IToken[]; + WsAfterPragma?: IToken[]; + CommentContent?: IToken[]; + WsAfterContent?: IToken[]; CommentCloseTag?: IToken[]; - PragmaSymbol?: IToken[]; }; } @@ -355,7 +452,6 @@ export interface LiteralNode { range: Range; open: OpenTagNode; close: CloseTagNode; - attributes: AttributeNode[]; children: StringNode; } @@ -402,17 +498,6 @@ export interface RootCstNode extends CstNode { }; } -export interface ElementContentCstNode extends CstNode { - children: { - Element?: ElementCstNode; - LiteralElement?: LiteralElementCstNode; - SelfCloseElement?: SelfCloseElementCstNode; - Comment?: CommentCstNode; - Pragma?: PragmaCstNode; - Value?: ElementValueCstNode; - }; -} - // Keep these keys required; everything else becomes recursively optional type DeepPartialExcept = // arrays diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts index 93c60818..11d39d78 100644 --- a/packages/poml/tests/reader/lexer.test.ts +++ b/packages/poml/tests/reader/lexer.test.ts @@ -756,6 +756,13 @@ describe('Malformed Patterns', () => { expect(tokenImages('@Pragma')).toEqual(['@Pragma']); expect(tokenImages('@pragma key=value')).toEqual(['@pragma', ' ', 'key', '=', 'value']); }); + + test('should handle ', () => { + expect(tokenImages('')).toEqual(['']); + expect(tokenImages('< />')).toEqual(['<', ' ', '/>']); + expect(tokenImages('< / >')).toEqual(['<', ' ', '/', ' ', '>']); + expect(tokenImages('')).toEqual(['']); + }); }); describe('Position Tracking Accuracy', () => { From 9ff1f7602ccf9186e1bfd2199d7b81efd32f888b Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sun, 31 Aug 2025 09:42:59 +0800 Subject: [PATCH 33/76] finish cst nodes --- packages/poml/next/lexer.ts | 5 + packages/poml/next/nodes.ts | 212 ++++++++++++++++++++++-------------- 2 files changed, 133 insertions(+), 84 deletions(-) diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts index 216f6e9e..0591d786 100644 --- a/packages/poml/next/lexer.ts +++ b/packages/poml/next/lexer.ts @@ -16,6 +16,11 @@ export const Equals = createToken({ name: 'Equals', pattern: /=/ }); export const DoubleQuote = createToken({ name: 'DoubleQuote', pattern: /"/ }); export const SingleQuote = createToken({ name: 'SingleQuote', pattern: /'/ }); export const Backslash = createToken({ name: 'Backslash', pattern: /\\/ }); +export const BackslashEscape = createToken({ + name: 'BackslashEscape', + pattern: /\\(n|r|t|'|"|{{|}}|\\|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})/, +}); +export const CharacterEntity = createToken({ name: 'CharacterEntity', pattern: /&#[0-9]+;|&[a-zA-Z][a-zA-Z0-9]+;/ }); /* Identifier is one of the following: * - XML tag names diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 389bf2dd..b2c586b9 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -14,7 +14,7 @@ import { CstNode, IToken } from 'chevrotain'; * * Cases that do not apply: * - Template syntax including braces: `{{ expression }}` (use TemplateNode) - * - String literals with quotes: `"hello"` (use StringNode or ValueNode) + * - String literals with quotes: `"hello"` (use LiteralNode or ValueNode) * - POML markup: `` (use element nodes) */ export interface ExpressionNode { @@ -23,8 +23,6 @@ export interface ExpressionNode { value: string; } -export interface ExpressionCstNode {} - /** * Represents a template interpolation with double curly braces, * or sometimes without braces in specific attributes. @@ -42,9 +40,9 @@ export interface ExpressionCstNode {} * * Cases that do not apply: * - Full attribute expressions: `if="x > 0"` (use ExpressionNode) - * - Plain text: `Hello World` (use StringNode) + * - Plain text: `Hello World` (use LiteralNode) * - Single braces: `{ not a template }` (treated as plain text) - * - Template elements: (use LiteralNode) + * - Template elements: (use ElementNode) * - With quotes: `"{{ var }}"` (use ValueNode) */ export interface TemplateNode { @@ -53,10 +51,28 @@ export interface TemplateNode { value: ExpressionNode; } +/** + * Related CST node interfaces for parsing stage. + */ + +export interface CstTemplateNode extends CstNode { + children: { + OpenTemplate?: IToken[]; + WsAfterOpen?: IToken[]; + // Content inside {{ and }} is treated as a single expression token. + // Eats everything until the next }} (or the whitespace before it). + // Handles \{{ and \}} escapes. We won't escape other chars here. + Content?: IToken[]; + // If it's close to the ending }}, try to eat whitespace before it. + WsAfterContent?: IToken[]; + CloseTemplate?: IToken[]; + }; +} + /** * Represents plain text content without any special syntax. * - * String nodes are the most basic content nodes, containing literal text + * Literal nodes are the most basic content nodes, containing literal text * that requires no processing. They are used both for content and as * components of other nodes (like attribute keys and tag names). * @@ -74,7 +90,7 @@ export interface TemplateNode { * - Expressions: `x > 0` (use ExpressionNode) * - Template variables: `{{ var }}` (use TemplateNode) */ -export interface StringNode { +export interface LiteralNode { kind: 'STRING'; range: Range; value: string; @@ -95,9 +111,9 @@ export interface StringNode { * - Multi-part content: `"Price: ${{amount}} USD"` * * Cases that do not apply: - * - Attribute keys: `class=...` (the `class` part uses StringNode) + * - Attribute keys: `class=...` (the `class` part uses LiteralNode) * - Pure expressions without quotes: `if=condition` (use ExpressionNode) - * - Tag names: `div` (use StringNode) + * - Tag names: `div` (use LiteralNode) * - Standalone template variables not in a value context * * Note: The range includes quotes if present, but children exclude them. @@ -105,7 +121,29 @@ export interface StringNode { export interface ValueNode { kind: 'VALUE'; range: Range; - children: (StringNode | TemplateNode)[]; + children: (LiteralNode | TemplateNode)[]; +} + +/** + * Related CST node interfaces for parsing stage. + * The following two interfaces are for quoted strings and will be transformed into ValueNode. + */ +export interface CstQuotedNode extends CstNode { + children: { + OpenQuote?: IToken[]; + // This is a normal quoted string without templates inside. + Content?: IToken[]; + CloseQuote?: IToken[]; + }; +} + +export interface CstQuotedTemplateNode extends CstNode { + children: { + OpenQuote?: IToken[]; + // Allows "Hello {{ friend["abc"] }}!" - mix of text and templates (with quotes). + Content?: (IToken | CstTemplateNode)[]; + CloseQuote?: IToken[]; + }; } /** @@ -116,13 +154,14 @@ export interface ValueNode { * and the collection expression for runtime evaluation. * * Cases that apply: - * - Simple iteration: `item in items` - * - Property access: `user in data.users` - * - Array literals: `num in [1, 2, 3]` - * - Method calls: `result in getResults()` - * - Nested property iteration: `task in project.tasks.active` + * - Simple iteration: `"item in items"` + * - Property access: `"user in data.users"` + * - Array literals: `"num in [1, 2, 3]"` + * - Method calls in single quotes: `'result in getResults()'` + * - Nested property iteration: `'task in project.tasks.active'` * * Cases that do not apply (not yet supported): + * - Without quotes: `item in items` (must be in quotes for now) * - Advanced loop syntax (not yet supported): `(item, index) in items` * - Destructuring patterns (not yet supported): `{name, age} in users` * - Conditional loops: `if` attributes (use separate condition handling) @@ -131,10 +170,31 @@ export interface ValueNode { export interface ForIteratorNode { kind: 'FORITERATOR'; range: Range; - iterator: StringNode; + iterator: LiteralNode; collection: ExpressionNode; } +/** + * Related CST node interfaces for parsing stage. + */ +export interface CstForIteratorNode extends CstNode { + children: { + OpenQuote?: IToken[]; + WsAfterOpen?: IToken[]; + Iterator?: IToken[]; + WsAfterIterator?: IToken[]; + InKeyword?: IToken[]; + WsAfterIn?: IToken[]; + // Follows the same parsing rules as template expression. + // But as we are in a quoted string, we need to handle + // backslash escapes like \" and \'. + // Greedily match until the next unescaped quote or ws before it. + Collection?: IToken[]; + WsAfterCollection?: IToken[]; + CloseQuote?: IToken[]; + }; +} + /** * Represents a standard attribute on a POML element. * @@ -142,62 +202,42 @@ export interface ForIteratorNode { * of a key-value pair where the key is always a simple string and the value * can be a complex composition of text and templates. * + * It also supports for-loop attributes via ForIterator, which contains + * loop iteration syntax rather than a simple value. It enables + * elements to be rendered multiple times based on a collection. + * * Cases that apply: * - Simple attributes: `class="container"`, `id='main'` * - Template values: `title="{{ pageTitle }}"` or `title={{ pageTitle }}` * - Mixed values: `placeholder="Enter {{ fieldName }}..."` + * - For attributes: `for="item in items"` (key is "for", value is ForIteratorNode) + * - Computed collections: `for='i in [...Array(5).keys()]'` * * Cases that do not apply: * - Boolean/presence attributes: `disabled`, `checked` (not yet supported) - * - For-loop attributes: `for="item in items"` (use ForLoopAttributeNode) * - Spread attributes (not yet supported): `{...props}` * - Dynamic attribute names (not supported): `[attrName]="value"` */ export interface AttributeNode { kind: 'ATTRIBUTE'; range: Range; - key: StringNode; - value: ValueNode; -} - -/** - * Represents a special for-loop attribute on POML elements. - * - * This specialized attribute node handles the `for` attribute specifically, - * which contains loop iteration syntax rather than a simple value. It enables - * elements to be rendered multiple times based on a collection. - * - * Cases that apply: - * - For attributes only: `for="item in items"` - * - Nested iterations: `for="subitem in item.children"` - * - Computed collections: `for="i in [...Array(5).keys()]"` - * - * Cases that do not apply: - * - Any attribute with a key other than "for" - * - Standard attributes: `class="..."` (use AttributeNode) - * - Conditional attributes: `if="..."` (use AttributeNode) - */ -export interface ForLoopAttributeNode { - kind: 'FORATTRIBUTE'; - range: Range; - key: StringNode; - value: ForIteratorNode; + key: LiteralNode; + value: ValueNode | ForIteratorNode; } /** * Related CST node interfaces for parsing stage. */ -export interface ForLoopAttributeCstNode extends CstNode { +export interface CstAttributeNode extends CstNode { children: { AttributeKey?: IToken[]; WsAfterKey?: IToken[]; Equals?: IToken[]; WsAfterEquals?: IToken[]; - OpenQuote?: IToken[]; - WsAfterOpenQuote?: IToken[]; - ForIterator?: ForIteratorCstNode[]; - WsBeforeCloseQuote?: IToken[]; - CloseQuote?: IToken[]; + // Choose between one: john="doe", john='doe', john={{ template }}, for="i in items" + quotedValue?: CstQuotedTemplateNode[]; + templatedValue?: CstTemplateNode[]; + forIteratorValue?: CstForIteratorNode[]; }; } @@ -223,8 +263,8 @@ export interface ForLoopAttributeCstNode extends CstNode { export interface OpenTagNode { kind: 'OPEN'; range: Range; - value: StringNode; // tag name - attributes: (AttributeNode | ForLoopAttributeNode)[]; + value: LiteralNode; // tag name + attributes: AttributeNode[]; } /** @@ -236,7 +276,7 @@ export interface OpenTagCstNode extends CstNode { WsAfterBracket?: IToken[]; TagName?: IToken[]; WsAfterName?: IToken[]; - Attribute?: AttributeCstNode[]; + Attribute?: CstAttributeNode[]; WsAfterAttribute?: IToken[]; CloseBracket?: IToken[]; }; @@ -261,7 +301,7 @@ export interface OpenTagCstNode extends CstNode { export interface CloseTagNode { kind: 'CLOSE'; range: Range; - value: StringNode; // tag name + value: LiteralNode; // tag name } /** @@ -296,20 +336,20 @@ export interface CloseTagCstNode extends CstNode { export interface SelfCloseElementNode { kind: 'SELFCLOSE'; range: Range; - value: StringNode; // tag name - attributes: (AttributeNode | ForLoopAttributeNode)[]; + value: LiteralNode; // tag name + attributes: AttributeNode[]; } /** * Related CST node interfaces for parsing stage. */ -export interface SelfCloseElementCstNode extends CstNode { +export interface CstSelfCloseElementNode extends CstNode { children: { OpenBracket?: IToken[]; WsAfterBracket?: IToken[]; TagName?: IToken[]; WsAfterName?: IToken[]; - Attribute?: AttributeCstNode[]; + Attribute?: CstAttributeNode[]; WsAfterAttribute?: IToken[]; SelfCloseBracket?: IToken[]; }; @@ -339,13 +379,13 @@ export interface ElementNode { range: Range; open: OpenTagNode; close: CloseTagNode; - children: (ElementNode | LiteralNode | CommentNode | PragmaNode | ValueNode)[]; + children: (ElementNode | LiteralElementNode | CommentNode | PragmaNode | ValueNode)[]; } /** * Related CST node interfaces for parsing stage. */ -export interface ElementCstNode extends CstNode { +export interface CstElementNode extends CstNode { children: { OpenTag?: OpenTagCstNode[]; CloseTag?: CloseTagCstNode[]; @@ -353,14 +393,15 @@ export interface ElementCstNode extends CstNode { }; } -export interface ElementContentCstNode extends CstNode { +export interface CstElementContentNode extends CstNode { children: { - Element?: ElementCstNode; - LiteralElement?: LiteralElementCstNode; - SelfCloseElement?: SelfCloseElementCstNode; - Comment?: CommentCstNode; - Pragma?: PragmaCstNode; - Value?: ElementValueCstNode; + Element?: CstElementNode[]; + LiteralElement?: CstLiteralElementNode[]; + SelfCloseElement?: CstSelfCloseElementNode[]; + Comment?: CstCommentNode[]; + Pragma?: CstPragmaNode[]; + Template?: CstTemplateNode[]; + TextContent?: IToken[]; }; } @@ -377,13 +418,13 @@ export interface ElementContentCstNode extends CstNode { export interface CommentNode { kind: 'COMMENT'; range: Range; - value: StringNode; + value: LiteralNode; } /** * Related CST node interfaces for parsing stage. */ -export interface CommentCstNode extends CstNode { +export interface CstCommentNode extends CstNode { children: { CommentOpenTag?: IToken[]; CommentContent?: IToken[]; @@ -407,19 +448,22 @@ export interface CommentCstNode extends CstNode { export interface PragmaNode { kind: 'PRAGMA'; range: Range; - value: StringNode; + identifier: LiteralNode; + options: LiteralNode[]; } /** * Related CST node interfaces for parsing stage. */ -export interface PragmaCstNode extends CstNode { +export interface CstPragmaNode extends CstNode { children: { CommentOpenTag?: IToken[]; WsAfterOpen?: IToken[]; PragmaKeyword?: IToken[]; WsAfterPragma?: IToken[]; - CommentContent?: IToken[]; + PragmaIdentifier?: IToken[]; + WsAfterIdentifier?: IToken[]; + PragmaOption?: (IToken | CstQuotedNode)[]; WsAfterContent?: IToken[]; CommentCloseTag?: IToken[]; }; @@ -428,7 +472,7 @@ export interface PragmaCstNode extends CstNode { /** * Represents an element that preserves literal content. * - * Literal nodes are special POML elements that treat their content as literal + * Literal element nodes are special POML elements that treat their content as literal * text, preventing template variable interpolation. They ensure content is * preserved exactly as written, useful for code samples or pre-formatted text. * When `` is used, the parser eats everything including tags and comments, @@ -444,27 +488,28 @@ export interface PragmaCstNode extends CstNode { * - Text with attributes enabling processing (future feature) * * Note: The tagName (value) can only be "text" in this version. - * Literal node is different from elements which do not support children. - * Literal node is handled on the CST parsing stage. + * Literal element node is different from elements which do not support nested tags, + * e.g., or is treated as raw text + this.MANY(() => { this.OR([ - // Continue consuming anything that is not the start of the matching close. { - GATE: () => { - if (this.LA(1).tokenType !== ClosingOpenBracket) { - return true; - } - // look ahead to see if it's or - let k = 2; - while (this.LA(k).tokenType === Whitespace) { - k++; - } - const t = this.LA(k); - if (t.tokenType !== Identifier) { - return true; - } - const name = (t.image || '').toLowerCase(); - return !(name === 'text' || name === 'template'); - }, - ALT: () => { - // Treat all as raw text content - this.OR( - anyOf( - AllTokens.filter((t) => t !== ClosingOpenBracket), // minimal guard - 'TextContent', - ), - ); - }, + GATE: () => !this.isAtLiteralClose(), + ALT: () => this.OR(this.anyOf(AllTokens, 'TextContent')), }, ]); }); @@ -432,19 +386,15 @@ export class ExtendedPomlParser extends CstParser { this.performSelfAnalysis(); } - // Expose entry for external callers (TypeScript-friendly) public parseRoot(): CstNode { - // @ts-expect-error Chevrotain types: RULE name maps to a function + // Invoke the entry rule (property is a function) return this.root(); } } -// Singleton parser instance +// Singleton parser export const extendedPomlParser = new ExtendedPomlParser(); -/** - * Convenience: tokenize + parse in one call. - */ export function parsePomlToCst(input: string): { cst: CstNode | undefined; lexErrors: ReturnType['errors']; From 038b6833cefb527c35fc7b0dc336ed78edc46db1 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 1 Sep 2025 19:17:47 +0800 Subject: [PATCH 44/76] . --- packages/poml/next/cst.ts | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 7737ec6e..5c40bedc 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -59,43 +59,43 @@ export class ExtendedPomlParser extends CstParser { private isNextPragma = () => { if (this.LA(1).tokenType !== CommentOpen) { -return false; -} + return false; + } let k = 2; while (this.LA(k).tokenType === Whitespace) { -k++; -} + k++; + } return this.LA(k).tokenType === PragmaKeyword; }; private isNextLiteralOpenTag = () => { if (this.LA(1).tokenType !== OpenBracket) { -return false; -} + return false; + } let k = 2; while (this.LA(k).tokenType === Whitespace) { -k++; -} + k++; + } const tName = this.LA(k); if (tName.tokenType !== Identifier) { -return false; -} + return false; + } const name = (tName.image || '').toLowerCase(); return name === 'text' || name === 'template'; }; private isAtLiteralClose = () => { if (this.LA(1).tokenType !== ClosingOpenBracket) { -return false; -} + return false; + } let k = 2; while (this.LA(k).tokenType === Whitespace) { -k++; -} + k++; + } const t = this.LA(k); if (t.tokenType !== Identifier) { -return false; -} + return false; + } const name = (t.image || '').toLowerCase(); return name === 'text' || name === 'template'; }; From 22827c995823218345efdc373d5d61cd472a204c Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 2 Sep 2025 00:01:08 +0800 Subject: [PATCH 45/76] . --- packages/poml/base.tsx | 5 +++- packages/poml/next/nodes.ts | 57 +++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/packages/poml/base.tsx b/packages/poml/base.tsx index 4e8c875c..c894c625 100644 --- a/packages/poml/base.tsx +++ b/packages/poml/base.tsx @@ -137,7 +137,10 @@ export interface PropsBase { // Experimental writerOptions?: object; - whiteSpace?: 'pre' | 'filter' | 'trim'; + whiteSpace?: 'pre' | 'filter' | 'trim' | 'collapse'; + + // Enforce inline on every element. + inline?: boolean; /** Soft character limit before truncation is applied. */ charLimit?: number; diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 6f793fc2..9884324f 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -1,6 +1,10 @@ import { Range } from './types'; import { CstNode, IToken } from 'chevrotain'; +export interface AstNode { + range: Range; // start and end offsets in the source text +} + /** * Represents a JavaScript expression as a string. * @@ -17,9 +21,8 @@ import { CstNode, IToken } from 'chevrotain'; * - String literals with quotes: `"hello"` (use LiteralNode or ValueNode) * - POML markup: `` (use element nodes) */ -export interface ExpressionNode { +export interface ExpressionNode extends AstNode { kind: 'EXPRESSION'; - range: Range; value: string; } @@ -45,9 +48,8 @@ export interface ExpressionNode { * - Template elements: (use LiteralNode) * - With quotes: `"{{ var }}"` (use ValueNode) */ -export interface TemplateNode { +export interface TemplateNode extends AstNode { kind: 'TEMPLATE'; - range: Range; value: ExpressionNode; } @@ -90,9 +92,8 @@ export interface CstTemplateNode extends CstNode { * - Expressions: `x > 0` (use ExpressionNode) * - Template variables: `{{ var }}` (use TemplateNode) */ -export interface LiteralNode { +export interface LiteralNode extends AstNode { kind: 'STRING'; - range: Range; value: string; } @@ -118,12 +119,16 @@ export interface LiteralNode { * * Note: The range includes quotes if present, but children exclude them. */ -export interface ValueNode { +export interface ValueNode extends AstNode { kind: 'VALUE'; - range: Range; children: (LiteralNode | TemplateNode)[]; } +export interface TextElementNode extends AstNode { + kind: 'TEXT'; + value: string; +} + /** * Related CST node interfaces for parsing stage. * The following two interfaces are for quoted strings and will be transformed into ValueNode. @@ -167,9 +172,8 @@ export interface CstQuotedTemplateNode extends CstNode { * - Conditional loops: `if` attributes (use separate condition handling) * - Template interpolation: `{{ items }}` (use TemplateNode) */ -export interface ForIteratorNode { +export interface ForIteratorNode extends AstNode { kind: 'FORITERATOR'; - range: Range; iterator: LiteralNode; collection: ExpressionNode; } @@ -218,9 +222,8 @@ export interface CstForIteratorNode extends CstNode { * - Spread attributes (not yet supported): `{...props}` * - Dynamic attribute names (not supported): `[attrName]="value"` */ -export interface AttributeNode { +export interface AttributeNode extends AstNode { kind: 'ATTRIBUTE'; - range: Range; key: LiteralNode; value: ValueNode | ForIteratorNode; } @@ -260,9 +263,8 @@ export interface CstAttributeNode extends CstNode { * - Complete elements: opening + content + closing (use ElementNode) * - Invalid or malformed tags (treated as text) */ -export interface OpenTagNode { +export interface OpenTagNode extends AstNode { kind: 'OPEN'; - range: Range; value: LiteralNode; // tag name attributes: AttributeNode[]; } @@ -298,9 +300,8 @@ export interface OpenTagCstNode extends CstNode { * - Self-closing tags: `
    ` (use SelfCloseTagNode) * - Tags with attributes (closing tags never have attributes) */ -export interface CloseTagNode { +export interface CloseTagNode extends AstNode { kind: 'CLOSE'; - range: Range; value: LiteralNode; // tag name } @@ -333,9 +334,8 @@ export interface CloseTagCstNode extends CstNode { * - Separate open/close tags: `
    ` (use ElementNode) * - Tags without the self-closing slash: `` (use OpenTagNode) */ -export interface SelfCloseElementNode { +export interface SelfCloseElementNode extends AstNode { kind: 'SELFCLOSE'; - range: Range; value: LiteralNode; // tag name attributes: AttributeNode[]; } @@ -374,9 +374,8 @@ export interface CstSelfCloseElementNode extends CstNode { * - Template variables: `{{ var }}` (use TemplateNode) * - Meta elements: `` tags (use MetaNode) */ -export interface ElementNode { +export interface ElementNode extends AstNode { kind: 'ELEMENT'; - range: Range; open: OpenTagNode; close: CloseTagNode; children: (ElementNode | LiteralElementNode | CommentNode | PragmaNode | ValueNode)[]; @@ -415,9 +414,8 @@ export interface CstElementContentNode extends CstNode { * Examples: * - `` */ -export interface CommentNode { +export interface CommentNode extends AstNode { kind: 'COMMENT'; - range: Range; value: LiteralNode; } @@ -444,22 +442,21 @@ export interface CstCommentNode extends CstNode { * - Specify version: `` * - Turn tags on/off: `` * - Turn speaker roles on/off: `` or `single` - * - White space policy: `` or `trim`, `collapse` or `remove` + * - White space policy: `` or `trim`, `collapse` * * Notes on white space policy: * - `pre`: preserve all whitespace as-is * - `trim`: trim leading/trailing whitespace in each element * - `collapse`: trim + collapse consecutive whitespace into a single space - * - `remove`: collapse remove all whitespaces between two nested elements + * If there are two inline="false" elements next to each other, space between them will be deleted. * * Each element type will have its own default whitespace policy. * For example, `` defaults to `pre`, while `` defaults to `collapse`. * However, when a pragma is set, it overrides the default for subsequent elements. * It will affect the AST constructing stages, and also affecting the props sent to components. */ -export interface PragmaNode { +export interface PragmaNode extends AstNode { kind: 'PRAGMA'; - range: Range; identifier: LiteralNode; options: LiteralNode[]; } @@ -507,9 +504,8 @@ export interface CstPragmaNode extends CstNode { * 3. If you really need `` in your POML. Recommended to use `<text>` * outside of literal element. */ -export interface LiteralElementNode { - kind: 'TEXT'; - range: Range; +export interface LiteralElementNode extends AstNode { + kind: 'LITERAL'; open: OpenTagNode; close: CloseTagNode; children: LiteralNode; @@ -543,9 +539,8 @@ export interface CstLiteralElementNode extends CstNode { * Cases that do not apply: * - All nested elements */ -export interface RootNode { +export interface RootNode extends AstNode { kind: 'ROOT'; - range: Range; children: (ElementNode | LiteralElementNode | CommentNode | PragmaNode | ValueNode)[]; } From 0beb690124fcdf6c86dd42deac47eb3568d6b0c2 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 2 Sep 2025 08:17:40 +0800 Subject: [PATCH 46/76] cst update --- packages/poml/next/cst.ts | 51 ++++++++++++++++++++++++------------- packages/poml/next/nodes.ts | 50 +++++++++++++++++++++--------------- 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 5c40bedc..dd8b990b 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -1,4 +1,3 @@ -// cstParser.ts import { CstParser, CstNode, IToken, TokenType } from 'chevrotain'; import { AllTokens, @@ -26,6 +25,23 @@ import { extendedPomlLexer, } from './lexer'; +import { + CstTemplateNode, + CstQuotedNode, + CstQuotedTemplateNode, + CstForIteratorNode, + CstAttributeNode, + CstOpenTagNode, + CstCloseTagNode, + CstSelfCloseElementNode, + CstElementNode, + CstElementContentNode, + CstCommentNode, + CstPragmaNode, + CstLiteralElementNode, + CstRootNode, +} from './nodes'; + /** * Extended POML CST Parser * @@ -36,20 +52,20 @@ import { */ export class ExtendedPomlParser extends CstParser { // ---- Rule property declarations (so TS knows they exist) ---- - public root!: (idxInOriginalText?: number) => CstNode; - public elementContentNode!: (idxInOriginalText?: number) => CstNode; - public templateNode!: (idxInOriginalText?: number) => CstNode; - public comment!: (idxInOriginalText?: number) => CstNode; - public pragma!: (idxInOriginalText?: number) => CstNode; - public quotedNoTemplate!: (idxInOriginalText?: number) => CstNode; - public quotedTemplate!: (idxInOriginalText?: number) => CstNode; - public forIteratorValue!: (idxInOriginalText?: number) => CstNode; - public attribute!: (idxInOriginalText?: number) => CstNode; - public openTag!: (idxInOriginalText?: number) => CstNode; - public closeTag!: (idxInOriginalText?: number) => CstNode; - public selfCloseElement!: (idxInOriginalText?: number) => CstNode; - public element!: (idxInOriginalText?: number) => CstNode; - public literalElement!: (idxInOriginalText?: number) => CstNode; + public root!: (idxInOriginalText?: number) => CstRootNode; + public elementContentNode!: (idxInOriginalText?: number) => CstElementContentNode; + public templateNode!: (idxInOriginalText?: number) => CstTemplateNode; + public comment!: (idxInOriginalText?: number) => CstCommentNode; + public pragma!: (idxInOriginalText?: number) => CstPragmaNode; + public quoted!: (idxInOriginalText?: number) => CstQuotedNode; + public quotedTemplate!: (idxInOriginalText?: number) => CstQuotedTemplateNode; + public forIteratorValue!: (idxInOriginalText?: number) => CstForIteratorNode; + public attribute!: (idxInOriginalText?: number) => CstAttributeNode; + public openTag!: (idxInOriginalText?: number) => CstOpenTagNode; + public closeTag!: (idxInOriginalText?: number) => CstCloseTagNode; + public selfCloseElement!: (idxInOriginalText?: number) => CstSelfCloseElementNode; + public element!: (idxInOriginalText?: number) => CstElementNode; + public literalElement!: (idxInOriginalText?: number) => CstLiteralElementNode; // ---- Small helpers ---- private anyOf = (tokenTypes: TokenType[], label?: string) => @@ -197,7 +213,7 @@ export class ExtendedPomlParser extends CstParser { // Options: unquoted tokens or quoted strings (no templates inside these) this.MANY(() => { this.OR([ - { ALT: () => this.SUBRULE(this.quotedNoTemplate, { LABEL: 'PragmaOption' }) }, + { ALT: () => this.SUBRULE(this.quoted, { LABEL: 'PragmaOption' }) }, { ALT: () => { this.OR( @@ -217,7 +233,7 @@ export class ExtendedPomlParser extends CstParser { this.CONSUME(CommentClose); }); - this.quotedNoTemplate = this.RULE('quotedNoTemplate', () => { + this.quoted = this.RULE('quoted', () => { this.OR([ { ALT: () => { @@ -370,6 +386,7 @@ export class ExtendedPomlParser extends CstParser { this.literalElement = this.RULE('literalElement', () => { this.SUBRULE(this.openTag, { LABEL: 'OpenTag' }); + // TODO: the ending tag should match the starting tag name (text/template) // Everything until the matching or is treated as raw text this.MANY(() => { this.OR([ diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 9884324f..7eb4a849 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -18,7 +18,7 @@ export interface AstNode { * * Cases that do not apply: * - Template syntax including braces: `{{ expression }}` (use TemplateNode) - * - String literals with quotes: `"hello"` (use LiteralNode or ValueNode) + * - String literals with quotes: `"hello"` (use ValueNode) * - POML markup: `` (use element nodes) */ export interface ExpressionNode extends AstNode { @@ -98,7 +98,8 @@ export interface LiteralNode extends AstNode { } /** - * Represents a composite value that may contain text and/or templates. + * Represents a composite value that may contain text. + * Used specifically for the "quotes" in attribute values. * * Value nodes are containers for mixed content, handling both pure text * and interpolated templates. They preserve quote information when used @@ -107,15 +108,12 @@ export interface LiteralNode extends AstNode { * Cases that apply: * - Quoted attribute values: `"some text"`, `'single quoted'` * - Mixed content with templates: `"Hello, {{ userName }}!"` - * - Text content between tags: `> some text <` (including whitespace) - * - Unquoted template values in certain contexts + * - Unquoted template values in certain attribute contexts * - Multi-part content: `"Price: ${{amount}} USD"` * * Cases that do not apply: * - Attribute keys: `class=...` (the `class` part uses LiteralNode) - * - Pure expressions without quotes: `if=condition` (use ExpressionNode) - * - Tag names: `div` (use LiteralNode) - * - Standalone template variables not in a value context + * - Pure expressions without quotes: `if=condition` (illegal) * * Note: The range includes quotes if present, but children exclude them. */ @@ -124,11 +122,6 @@ export interface ValueNode extends AstNode { children: (LiteralNode | TemplateNode)[]; } -export interface TextElementNode extends AstNode { - kind: 'TEXT'; - value: string; -} - /** * Related CST node interfaces for parsing stage. * The following two interfaces are for quoted strings and will be transformed into ValueNode. @@ -272,7 +265,7 @@ export interface OpenTagNode extends AstNode { /** * Related CST node interfaces for parsing stage. */ -export interface OpenTagCstNode extends CstNode { +export interface CstOpenTagNode extends CstNode { children: { OpenBracket?: IToken[]; WsAfterBracket?: IToken[]; @@ -308,7 +301,7 @@ export interface CloseTagNode extends AstNode { /** * Related CST node interfaces for parsing stage. */ -export interface CloseTagCstNode extends CstNode { +export interface CstCloseTagNode extends CstNode { children: { ClosingOpenBracket?: IToken[]; WsAfterBracket?: IToken[]; @@ -378,7 +371,21 @@ export interface ElementNode extends AstNode { kind: 'ELEMENT'; open: OpenTagNode; close: CloseTagNode; - children: (ElementNode | LiteralElementNode | CommentNode | PragmaNode | ValueNode)[]; + children: (ElementNode | LiteralElementNode | CommentNode | PragmaNode | TextElementNode)[]; +} + +/** + * Very similar to ValueNode, but specifically for text content between tags. + * + * Cases that apply: + * - Text content between tags: `> some text <` (including whitespace) + * + * Cases that do not apply: + * - Text inside or other literal elements (use LiteralElementNode) + */ +export interface TextElementNode extends AstNode { + kind: 'TEXT'; + value: string; } /** @@ -386,8 +393,8 @@ export interface ElementNode extends AstNode { */ export interface CstElementNode extends CstNode { children: { - OpenTag?: OpenTagCstNode[]; - CloseTag?: CloseTagCstNode[]; + OpenTag?: CstOpenTagNode[]; + CloseTag?: CstCloseTagNode[]; Content?: CstElementContentNode[]; }; } @@ -491,8 +498,8 @@ export interface CstPragmaNode extends CstNode { * - Explicit text elements: `Literal {{ not_interpolated }}` * * Cases that do not apply: - * - Regular text content with interpolation (use ValueNode) - * - Plain text outside elements (use ValueNode) + * - Regular text content with interpolation (use TextElementNode or ValueNode) + * - Plain text outside elements (use TextElementNode) * - Elements allowing template processing (use ElementNode) * - Text with attributes enabling processing (future feature) * @@ -516,11 +523,11 @@ export interface LiteralElementNode extends AstNode { */ export interface CstLiteralElementNode extends CstNode { children: { - OpenTag?: OpenTagCstNode[]; + OpenTag?: CstOpenTagNode[]; // All content between open and close tags is treated as literal text // including other tags, comments, pragmas, etc. except for ``. TextContent?: IToken[]; - CloseTag?: CloseTagCstNode[]; + CloseTag?: CstCloseTagNode[]; }; } @@ -584,6 +591,7 @@ export type StrictNode = | SelfCloseElementNode | ElementNode | LiteralElementNode + | TextElementNode | CommentNode | PragmaNode | RootNode; From bb7ca37389aaa06f3b51e67d572c311f5b7e46ed Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 2 Sep 2025 09:23:52 +0800 Subject: [PATCH 47/76] update nodes --- packages/poml/next/cst.ts | 31 +++++++++++++++++++++++++++++++ packages/poml/next/nodes.ts | 33 ++++++++++++--------------------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index dd8b990b..e93d54fb 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -41,6 +41,7 @@ import { CstLiteralElementNode, CstRootNode, } from './nodes'; +import { listComponentAliases } from 'poml/base'; /** * Extended POML CST Parser @@ -67,6 +68,25 @@ export class ExtendedPomlParser extends CstParser { public element!: (idxInOriginalText?: number) => CstElementNode; public literalElement!: (idxInOriginalText?: number) => CstLiteralElementNode; + // ---- Tag names for rules (for CST nodes) ---- + private validComponentNames: Set; + + // They are handled in file.tsx currently. + // I think they will be gradually moved to component registry in future. + private validDirectives: Set = new Set([ + 'include', + 'let', + 'output-schema', + 'outputschema', + 'tool-definition', + 'tool-def', + 'tooldef', + 'tool', + 'template', + ]); + // This list affects the CST parser stage only. + private literalTagNames: Set = new Set(['text', 'template']); + // ---- Small helpers ---- private anyOf = (tokenTypes: TokenType[], label?: string) => tokenTypes.map((tt) => ({ @@ -113,14 +133,25 @@ export class ExtendedPomlParser extends CstParser { return false; } const name = (t.image || '').toLowerCase(); + + // TODO: should match the opening tag name return name === 'text' || name === 'template'; }; + private isValidOpenTag = (tagName: string) => { + // When pragma strict is enabled, only known component names are allowed as tags. + // Other component names will show as errors in the semantic analysis stage. + // When pragma strict is not enabled, tag names that are not known components + // will be treated as texts. + return this.validComponentNames.has(tagName.toLowerCase()); + }; + constructor() { super(AllTokens, { outputCst: true, recoveryEnabled: true, }); + this.validComponentNames = new Set(listComponentAliases()); // --------------------------- // RULE DEFINITIONS (as properties) diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 7eb4a849..0ccbc3d5 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -264,8 +264,11 @@ export interface OpenTagNode extends AstNode { /** * Related CST node interfaces for parsing stage. + * + * Opening tag without the ending close bracket. + * Allow prefix sharing with SelfCloseElementNode. */ -export interface CstOpenTagNode extends CstNode { +export interface CstOpenTagPartialNode extends CstNode { children: { OpenBracket?: IToken[]; WsAfterBracket?: IToken[]; @@ -273,7 +276,6 @@ export interface CstOpenTagNode extends CstNode { WsAfterName?: IToken[]; Attribute?: CstAttributeNode[]; WsAfterAttribute?: IToken[]; - CloseBracket?: IToken[]; }; } @@ -333,21 +335,6 @@ export interface SelfCloseElementNode extends AstNode { attributes: AttributeNode[]; } -/** - * Related CST node interfaces for parsing stage. - */ -export interface CstSelfCloseElementNode extends CstNode { - children: { - OpenBracket?: IToken[]; - WsAfterBracket?: IToken[]; - TagName?: IToken[]; - WsAfterName?: IToken[]; - Attribute?: CstAttributeNode[]; - WsAfterAttribute?: IToken[]; - SelfCloseBracket?: IToken[]; - }; -} - /** * Represents a complete POML element with its content. * @@ -393,9 +380,12 @@ export interface TextElementNode extends AstNode { */ export interface CstElementNode extends CstNode { children: { - OpenTag?: CstOpenTagNode[]; - CloseTag?: CstCloseTagNode[]; + OpenTagPartial?: CstOpenTagPartialNode[]; + OpenTagCloseBracket?: IToken[]; Content?: CstElementContentNode[]; + CloseTag?: CstCloseTagNode[]; + // Alternative, it can also be a self-closing tag. + SelfCloseBracket?: IToken[]; }; } @@ -403,7 +393,6 @@ export interface CstElementContentNode extends CstNode { children: { Element?: CstElementNode[]; LiteralElement?: CstLiteralElementNode[]; - SelfCloseElement?: CstSelfCloseElementNode[]; Comment?: CstCommentNode[]; Pragma?: CstPragmaNode[]; Template?: CstTemplateNode[]; @@ -523,11 +512,13 @@ export interface LiteralElementNode extends AstNode { */ export interface CstLiteralElementNode extends CstNode { children: { - OpenTag?: CstOpenTagNode[]; + OpenTagPartial?: CstOpenTagPartialNode[]; + OpenTagCloseBracket?: IToken[]; // All content between open and close tags is treated as literal text // including other tags, comments, pragmas, etc. except for ``. TextContent?: IToken[]; CloseTag?: CstCloseTagNode[]; + // Literal element cannot be self-closing. }; } From 5aa67f75b881ba2da28f277ee14bb75881ee504e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 2 Sep 2025 10:58:39 +0800 Subject: [PATCH 48/76] . --- packages/poml/next/cst.ts | 109 ++++++++++++++++++------------------ packages/poml/next/nodes.ts | 8 +-- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index e93d54fb..7eac6f9c 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -31,9 +31,8 @@ import { CstQuotedTemplateNode, CstForIteratorNode, CstAttributeNode, - CstOpenTagNode, + CstOpenTagPartialNode, CstCloseTagNode, - CstSelfCloseElementNode, CstElementNode, CstElementContentNode, CstCommentNode, @@ -54,17 +53,16 @@ import { listComponentAliases } from 'poml/base'; export class ExtendedPomlParser extends CstParser { // ---- Rule property declarations (so TS knows they exist) ---- public root!: (idxInOriginalText?: number) => CstRootNode; - public elementContentNode!: (idxInOriginalText?: number) => CstElementContentNode; - public templateNode!: (idxInOriginalText?: number) => CstTemplateNode; + public elementContent!: (idxInOriginalText?: number) => CstElementContentNode; + public template!: (idxInOriginalText?: number) => CstTemplateNode; public comment!: (idxInOriginalText?: number) => CstCommentNode; public pragma!: (idxInOriginalText?: number) => CstPragmaNode; public quoted!: (idxInOriginalText?: number) => CstQuotedNode; public quotedTemplate!: (idxInOriginalText?: number) => CstQuotedTemplateNode; public forIteratorValue!: (idxInOriginalText?: number) => CstForIteratorNode; public attribute!: (idxInOriginalText?: number) => CstAttributeNode; - public openTag!: (idxInOriginalText?: number) => CstOpenTagNode; + public openTagPartial!: (idxInOriginalText?: number) => CstOpenTagPartialNode; public closeTag!: (idxInOriginalText?: number) => CstCloseTagNode; - public selfCloseElement!: (idxInOriginalText?: number) => CstSelfCloseElementNode; public element!: (idxInOriginalText?: number) => CstElementNode; public literalElement!: (idxInOriginalText?: number) => CstLiteralElementNode; @@ -160,11 +158,11 @@ export class ExtendedPomlParser extends CstParser { this.root = this.RULE('root', () => { // CstRootNode: { Content?: CstElementContentNode[] } this.MANY(() => { - this.SUBRULE(this.elementContentNode, { LABEL: 'Content' }); + this.SUBRULE(this.elementContent, { LABEL: 'Content' }); }); }); - this.elementContentNode = this.RULE('elementContentNode', () => { + this.elementContent = this.RULE('elementContent', () => { this.OR([ // pragma (must come before raw comment) { @@ -172,18 +170,14 @@ export class ExtendedPomlParser extends CstParser { ALT: () => this.SUBRULE(this.pragma, { LABEL: 'Pragma' }), }, // regular comment - { ALT: () => this.SUBRULE(this.comment, { LABEL: 'Comment' }) }, - - // template { - GATE: () => this.LA(1).tokenType === TemplateOpen, - ALT: () => this.SUBRULE(this.templateNode, { LABEL: 'Template' }), + ALT: () => this.SUBRULE(this.comment, { LABEL: 'Comment' }), }, - // self-close element + // template { - GATE: this.BACKTRACK(this.selfCloseElement), - ALT: () => this.SUBRULE(this.selfCloseElement, { LABEL: 'SelfCloseElement' }), + GATE: () => this.LA(1).tokenType === TemplateOpen, + ALT: () => this.SUBRULE(this.template, { LABEL: 'Template' }), }, // literal element: or is treated as raw text diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 0ccbc3d5..0ddfa331 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -273,9 +273,9 @@ export interface CstOpenTagPartialNode extends CstNode { OpenBracket?: IToken[]; WsAfterBracket?: IToken[]; TagName?: IToken[]; - WsAfterName?: IToken[]; + WsBeforeEachAttribute?: IToken[]; Attribute?: CstAttributeNode[]; - WsAfterAttribute?: IToken[]; + WsAfterAll?: IToken[]; }; } @@ -467,9 +467,9 @@ export interface CstPragmaNode extends CstNode { PragmaKeyword?: IToken[]; WsAfterPragma?: IToken[]; PragmaIdentifier?: IToken[]; - WsAfterIdentifier?: IToken[]; + WsBeforeEachOption?: IToken[]; PragmaOption?: (IToken | CstQuotedNode)[]; - WsAfterContent?: IToken[]; + WsAfterAll?: IToken[]; CommentClose?: IToken[]; }; } From 0a7b888bf4e2e42a0f4ced96d2122be5601dc517 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Thu, 4 Sep 2025 14:05:54 +0800 Subject: [PATCH 49/76] review to element --- packages/poml/next/cst.ts | 67 ++++++++++++++++++++++++++++--------- packages/poml/next/nodes.ts | 5 +-- 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 7eac6f9c..4c73b7fa 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -91,6 +91,18 @@ export class ExtendedPomlParser extends CstParser { ALT: () => (label ? this.CONSUME(tt, { LABEL: label }) : this.CONSUME(tt)), })); + // Lookahead helper: Check if next is whitespace but next non-whitespace token is not of given type + private isSafeWhitespace = (tokenType: TokenType) => { + if (this.LA(1).tokenType !== Whitespace) { + return false; + } + let k = 2; + while (this.LA(k).tokenType === Whitespace) { + k++; + } + return this.LA(k).tokenType !== tokenType; + }; + private isNextPragma = () => { if (this.LA(1).tokenType !== CommentOpen) { return false; @@ -208,9 +220,9 @@ export class ExtendedPomlParser extends CstParser { this.AT_LEAST_ONE(() => { this.OR([ - // mid-content whitespace: only if NOT followed by TemplateClose { - GATE: () => this.LA(1).tokenType === Whitespace && this.LA(2).tokenType !== TemplateClose, + // mid-content whitespace: only if NOT followed by TemplateClose + GATE: () => this.isSafeWhitespace(TemplateClose), ALT: () => this.CONSUME1(Whitespace, { LABEL: 'Content' }), }, // everything else in TokensExpression except Whitespace (handled above) @@ -323,25 +335,46 @@ export class ExtendedPomlParser extends CstParser { this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsAfterIterator' })); this.CONSUME2(Identifier, { LABEL: 'InKeyword' }); this.OPTION3(() => this.CONSUME3(Whitespace, { LABEL: 'WsAfterIn' })); - this.AT_LEAST_ONE(() => { - this.OR(this.anyOf(TokensDoubleQuotedExpression, 'Collection')); - }); - this.OPTION4(() => this.CONSUME4(Whitespace, { LABEL: 'WsAfterCollection' })); + // It's written as a double quoted expression without {{ }} here + // but it will be treated as an expression in the semantic analysis stage. + (this.AT_LEAST_ONE(() => { + this.OR([ + { + GATE: () => this.isSafeWhitespace(DoubleQuote), + ALT: () => this.CONSUME4(Whitespace, { LABEL: 'Collection' }), + }, + ...this.anyOf( + TokensDoubleQuoted.filter((t) => t !== Whitespace), + 'Collection', + ), + ]); + }), + this.OPTION4(() => this.CONSUME5(Whitespace, { LABEL: 'WsAfterCollection' }))); this.CONSUME2(DoubleQuote, { LABEL: 'CloseQuote' }); }, }, { ALT: () => { this.CONSUME(SingleQuote, { LABEL: 'OpenQuote' }); - this.OPTION5(() => this.CONSUME5(Whitespace, { LABEL: 'WsAfterOpen' })); + this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' })); this.CONSUME3(Identifier, { LABEL: 'Iterator' }); - this.OPTION6(() => this.CONSUME6(Whitespace, { LABEL: 'WsAfterIterator' })); + this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsAfterIterator' })); this.CONSUME4(Identifier, { LABEL: 'InKeyword' }); - this.OPTION7(() => this.CONSUME7(Whitespace, { LABEL: 'WsAfterIn' })); - this.AT_LEAST_ONE2(() => { - this.OR(this.anyOf(TokensSingleQuotedExpression, 'Collection')); - }); - this.OPTION8(() => this.CONSUME8(Whitespace, { LABEL: 'WsAfterCollection' })); + this.OPTION3(() => this.CONSUME3(Whitespace, { LABEL: 'WsAfterIn' })); + // Similar for single quoted expression + (this.AT_LEAST_ONE(() => { + this.OR([ + { + GATE: () => this.isSafeWhitespace(SingleQuote), + ALT: () => this.CONSUME4(Whitespace, { LABEL: 'Collection' }), + }, + ...this.anyOf( + TokensSingleQuoted.filter((t) => t !== Whitespace), + 'Collection', + ), + ]); + }), + this.OPTION4(() => this.CONSUME5(Whitespace, { LABEL: 'WsAfterCollection' }))); this.CONSUME2(SingleQuote, { LABEL: 'CloseQuote' }); }, }, @@ -374,19 +407,21 @@ export class ExtendedPomlParser extends CstParser { this.openTagPartial = this.RULE('openTagPartial', () => { this.CONSUME(OpenBracket); - this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterBracket' })); + this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' })); this.CONSUME(Identifier, { LABEL: 'TagName' }); this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsAfterName' })); this.MANY(() => { + this.OPTION3(() => this.CONSUME3(Whitespace, { LABEL: 'WsBeforeEachAttribute' })); this.SUBRULE(this.attribute, { LABEL: 'Attribute' }); - this.OPTION3(() => this.CONSUME3(Whitespace, { LABEL: 'WsAfterAttribute' })); }); + this.OPTION4(() => this.CONSUME4(Whitespace, { LABEL: 'WsAfterAll' })); }); this.closeTag = this.RULE('closeTag', () => { this.CONSUME(ClosingOpenBracket); - this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterBracket' })); + this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' })); this.CONSUME(Identifier, { LABEL: 'TagName' }); + this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsBeforeClose' })); this.CONSUME(CloseBracket); }); diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 0ddfa331..163c622e 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -271,7 +271,7 @@ export interface OpenTagNode extends AstNode { export interface CstOpenTagPartialNode extends CstNode { children: { OpenBracket?: IToken[]; - WsAfterBracket?: IToken[]; + WsAfterOpen?: IToken[]; TagName?: IToken[]; WsBeforeEachAttribute?: IToken[]; Attribute?: CstAttributeNode[]; @@ -306,8 +306,9 @@ export interface CloseTagNode extends AstNode { export interface CstCloseTagNode extends CstNode { children: { ClosingOpenBracket?: IToken[]; - WsAfterBracket?: IToken[]; + WsAfterOpen?: IToken[]; TagName?: IToken[]; + WsBeforeClose?: IToken[]; CloseBracket?: IToken[]; }; } From 1ae48193bada6086c2f7de82ff4c54650baa415d Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Thu, 4 Sep 2025 18:04:50 +0800 Subject: [PATCH 50/76] . --- packages/poml/next/cst.ts | 153 +++++++++++++----------------------- packages/poml/next/nodes.ts | 73 +++++------------ 2 files changed, 75 insertions(+), 151 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 4c73b7fa..d268cc08 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -37,7 +37,6 @@ import { CstElementContentNode, CstCommentNode, CstPragmaNode, - CstLiteralElementNode, CstRootNode, } from './nodes'; import { listComponentAliases } from 'poml/base'; @@ -64,7 +63,6 @@ export class ExtendedPomlParser extends CstParser { public openTagPartial!: (idxInOriginalText?: number) => CstOpenTagPartialNode; public closeTag!: (idxInOriginalText?: number) => CstCloseTagNode; public element!: (idxInOriginalText?: number) => CstElementNode; - public literalElement!: (idxInOriginalText?: number) => CstLiteralElementNode; // ---- Tag names for rules (for CST nodes) ---- private validComponentNames: Set; @@ -92,15 +90,12 @@ export class ExtendedPomlParser extends CstParser { })); // Lookahead helper: Check if next is whitespace but next non-whitespace token is not of given type - private isSafeWhitespace = (tokenType: TokenType) => { - if (this.LA(1).tokenType !== Whitespace) { - return false; - } - let k = 2; - while (this.LA(k).tokenType === Whitespace) { + private isAlmostClose = (tokenType: TokenType) => { + let k = 1; + if (this.LA(k).tokenType === Whitespace) { k++; } - return this.LA(k).tokenType !== tokenType; + return this.LA(k).tokenType === tokenType; }; private isNextPragma = () => { @@ -114,23 +109,7 @@ export class ExtendedPomlParser extends CstParser { return this.LA(k).tokenType === PragmaKeyword; }; - private isNextLiteralOpenTag = () => { - if (this.LA(1).tokenType !== OpenBracket) { - return false; - } - let k = 2; - while (this.LA(k).tokenType === Whitespace) { - k++; - } - const tName = this.LA(k); - if (tName.tokenType !== Identifier) { - return false; - } - const name = (tName.image || '').toLowerCase(); - return name === 'text' || name === 'template'; - }; - - private isAtLiteralClose = () => { + private isAtLiteralClose = (expectedTagName: string) => { if (this.LA(1).tokenType !== ClosingOpenBracket) { return false; } @@ -144,8 +123,7 @@ export class ExtendedPomlParser extends CstParser { } const name = (t.image || '').toLowerCase(); - // TODO: should match the opening tag name - return name === 'text' || name === 'template'; + return name === expectedTagName.toLowerCase(); }; private isValidOpenTag = (tagName: string) => { @@ -192,12 +170,6 @@ export class ExtendedPomlParser extends CstParser { ALT: () => this.SUBRULE(this.template, { LABEL: 'Template' }), }, - // literal element: or is treated as raw text - this.MANY(() => { - this.OR([ - { - GATE: () => !this.isAtLiteralClose(), - ALT: () => this.OR(this.anyOf(AllTokens, 'TextContent')), - }, - ]); - }); - - this.SUBRULE(this.closeTag, { LABEL: 'CloseTag' }); - }); - this.performSelfAnalysis(); } diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index 163c622e..6c77e3ea 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -344,22 +344,38 @@ export interface SelfCloseElementNode extends AstNode { * open tag), and may have child content including other elements, text, * or values. * + * It should also support literal elements, which are: + * + * - Special POML elements that treat their content as literal text + * - Prevents template variable interpolation + * - Content is preserved exactly as written, useful for code samples or pre-formatted text + * - When `` is used, the parser eats everything including tags and comments, + * including nested `` itself, until a matching `` is found + * - The tagName can only be "text" and "template" for literal elements + * - If you need `` in your POML content, use `<text>` outside of literal elements + * * Cases that apply: * - Any elements: `...content...` * - Output schemas with templates: `{{ schemaDefinition }}` * - Nested elements: `
    Text
    ` + * - Literal text elements: `Literal {{ not_interpolated }}` (literal elements) * * Cases that do not apply: * - Self-closing elements: `` (use SelfCloseTagNode) * - Literal text content: plain text (use LiteralNode) * - Template variables: `{{ var }}` (use TemplateNode) * - Meta elements: `` tags (use MetaNode) + * + * Note: + * - Literal element node is different from elements which do not support nested tags + * (e.g., ). Literal element node is handled on the CST parsing stage. */ export interface ElementNode extends AstNode { kind: 'ELEMENT'; open: OpenTagNode; close: CloseTagNode; - children: (ElementNode | LiteralElementNode | CommentNode | PragmaNode | TextElementNode)[]; + children: (ElementNode | CommentNode | PragmaNode | TextElementNode)[]; + // isLiteral?: boolean; // True for and or + + // Unmatched tags should not error in cst stage +}); + describe('Helper function sanity', () => { test('images() on template: token lists -> string[], node lists -> nested[]', () => { const { node } = withParser('{{ name }}', (p) => p.template()) as { node: CstTemplateNode }; @@ -392,10 +411,10 @@ function mapChildrenBimorphic( if (Array.isArray(arr)) { for (const v of arr) { if (isToken(v)) { -out.push(mapToken(v)); -} else if (isCstNode(v)) { -out.push(mapNode(v)); -} + out.push(mapToken(v)); + } else if (isCstNode(v)) { + out.push(mapNode(v)); + } // else ignore silently } } From 410c7691ddca9fd88d008cb83f14f5f85894c466 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 5 Sep 2025 17:35:07 +0800 Subject: [PATCH 61/76] . --- packages/poml/next/cst.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 275239b8..c1d9d671 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -268,6 +268,8 @@ export class ExtendedPomlParser extends CstParser { }); this.singleQuotedTrimmedTokens = this.RULE('singleQuotedTrimmedTokens', () => { + // Trimmed content without leading/trailing whitespace + // Must be non-empty. // Greedily match until the next single quote (allow inner whitespace) this.AT_LEAST_ONE({ GATE: () => !this.atAlmostClose(SingleQuote), From 9de1b54e0b6828c210721b3eda04ab11cab574b0 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 5 Sep 2025 17:35:27 +0800 Subject: [PATCH 62/76] . --- packages/poml/tests/reader/cst.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts index 5caf23f6..d2ac98ce 100644 --- a/packages/poml/tests/reader/cst.test.ts +++ b/packages/poml/tests/reader/cst.test.ts @@ -15,7 +15,6 @@ import type { CstOpenTagPartialNode, CstCloseTagNode, CstElementNode, - CstLiteralTagTokens, } from 'poml/next/nodes'; function withParser(input: string, run: (p: ExtendedPomlParser) => T) { From 80145d24c275bf2ab5fd587a56d02331b19898a4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 5 Sep 2025 18:21:54 +0800 Subject: [PATCH 63/76] . --- packages/poml/next/cst.ts | 1 + packages/poml/next/nodes.ts | 14 +- packages/poml/tests/reader/cst.test.ts | 420 +++++++++++-------------- 3 files changed, 200 insertions(+), 235 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index c1d9d671..4c0fc814 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -168,6 +168,7 @@ export class ExtendedPomlParser extends CstParser { constructor() { super(AllTokens, { recoveryEnabled: true, + nodeLocationTracking: 'full', }); this.validComponentNames = new Set(listComponentAliases()); diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts index c8928b55..e0f7e007 100644 --- a/packages/poml/next/nodes.ts +++ b/packages/poml/next/nodes.ts @@ -73,7 +73,7 @@ export interface CstTemplateNode extends CstNode { // Content inside {{ and }} is treated as a single expression token. // Eats everything until the next }} (or the whitespace before it). // Handles \{{ and \}} escapes. We won't escape other chars here. - Content?: CstExpressionTokens[]; + Content?: CstTokens[]; // If it's close to the ending }}, try to eat whitespace before it. WsAfterContent?: IToken[]; TemplateClose?: IToken[]; @@ -139,7 +139,7 @@ export interface CstQuotedNode extends CstNode { children: { OpenQuote?: IToken[]; // This is a normal quoted string without templates inside. - Content?: (CstDoubleQuotedTokens | CstSingleQuotedTokens)[]; + Content?: CstTokens[]; CloseQuote?: IToken[]; }; } @@ -148,7 +148,7 @@ export interface CstQuotedTemplateNode extends CstNode { children: { OpenQuote?: IToken[]; // Allows "Hello {{ friend["abc"] }}!" - mix of text and templates (with quotes). - Content?: (CstDoubleQuotedExpressionTokens | CstSingleQuotedExpressionTokens | CstTemplateNode)[]; + Content?: (CstTokens | CstTemplateNode)[]; CloseQuote?: IToken[]; }; } @@ -195,7 +195,7 @@ export interface CstForIteratorNode extends CstNode { // But as we are in a quoted string, we need to handle // backslash escapes like \" and \'. // Greedily match until the next unescaped quote or ws before it. - Collection?: (CstDoubleQuotedTrimmedTokens | CstSingleQuotedTrimmedTokens)[]; + Collection?: CstTokens[]; WsAfterCollection?: IToken[]; CloseQuote?: IToken[]; }; @@ -409,7 +409,7 @@ export interface CstElementNode extends CstNode { OpenTagPartial?: CstOpenTagPartialNode[]; OpenTagCloseBracket?: IToken[]; Content?: CstElementContentNode[]; - TextContent?: CstLiteralTagTokens[]; // For literal elements like + TextContent?: CstTokens[]; // For literal elements like CloseTag?: CstCloseTagNode[]; // Alternative, it can also be a self-closing tag. SelfCloseBracket?: IToken[]; @@ -422,7 +422,7 @@ export interface CstElementContentNode extends CstNode { Comment?: CstCommentNode[]; Pragma?: CstPragmaNode[]; Template?: CstTemplateNode[]; - TextContent?: CstBetweenTagsTokens[]; + TextContent?: CstTokens[]; }; } @@ -447,7 +447,7 @@ export interface CommentNode extends AstNode { export interface CstCommentNode extends CstNode { children: { CommentOpen?: IToken[]; - Content?: CstCommentTokens[]; + Content?: CstTokens[]; CommentClose?: IToken[]; }; } diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts index d2ac98ce..5fda1bd5 100644 --- a/packages/poml/tests/reader/cst.test.ts +++ b/packages/poml/tests/reader/cst.test.ts @@ -2,7 +2,7 @@ import { describe, expect, test } from '@jest/globals'; import { CstNode, IToken } from 'chevrotain'; import { ExtendedPomlParser } from 'poml/next/cst'; import { extendedPomlLexer, Whitespace, Identifier } from 'poml/next/lexer'; -import type { +import { CstRootNode, CstElementContentNode, CstTemplateNode, @@ -225,6 +225,8 @@ describe('Special Tokens', () => { done`; const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; console.dir(images(node), { depth: null }); + console.dir(names(node), { depth: null }); + console.dir(locations(node), { depth: null }); }); // All kinds of whitespaces @@ -236,255 +238,217 @@ done`; // Unmatched tags should not error in cst stage }); -describe('Helper function sanity', () => { - test('images() on template: token lists -> string[], node lists -> nested[]', () => { - const { node } = withParser('{{ name }}', (p) => p.template()) as { node: CstTemplateNode }; +/* -------------------- tiny guards -------------------- */ +const isToken = (x: unknown): x is IToken => !!x && typeof (x as IToken).image === 'string'; - const snap = images(node) as ImagesTree; - - // Token-only props => string[] - expect(Array.isArray(snap.TemplateOpen)).toBe(true); - expect(typeof snap.TemplateOpen![0]).toBe('string'); - expect(snap.TemplateOpen![0]).toBe('{{'); - - expect(Array.isArray(snap.TemplateClose)).toBe(true); - expect(typeof snap.TemplateClose![0]).toBe('string'); - expect(snap.TemplateClose![0]).toBe('}}'); - - // Node-only prop => nested[] - expect(Array.isArray(snap.Content)).toBe(true); - expect(typeof snap.Content![0]).toBe('object'); // nested tree, not string - // Nested should mirror structure (has children keys) - expect(snap.Content![0]).toBeDefined(); - - // Present keys are never undefined - for (const k of Object.keys(node.children)) { - // @ts-expect-error runtime check - expect(snap[k]).toBeDefined(); - // @ts-expect-error runtime check - expect(Array.isArray(snap[k])).toBe(true); - } - }); +const isCstNode = (x: unknown): x is CstNode => + !!x && typeof (x as any).name === 'string' && typeof (x as any).children === 'object'; - test('names() shape: has { name, children } and token items are tokenType names', () => { - const { node } = withParser('{{ name }}', (p) => p.template()) as { node: CstTemplateNode }; - const snap = names(node) as NamesTree; - - expect(snap.name).toBe('template'); - expect(snap.children).toBeDefined(); - - // Token-only -> string (tokenType name) - const tokName = snap.children.TemplateOpen?.[0]; - expect(typeof tokName).toBe('string'); - expect(tokName!.length).toBeGreaterThan(0); - - // Node-only -> nested NamesTree - const nested = snap.children.Content?.[0]; - expect(typeof nested).toBe('object'); - expect((nested as any).name).toBeDefined(); - expect((nested as any).children).toBeDefined(); - - // Never undefined for present keys - for (const k of Object.keys(node.children)) { - // @ts-expect-error runtime check - expect(Array.isArray(snap.children[k])).toBe(true); - } - }); +/* -------------------- ranges -------------------- */ +const tokStart = (t: IToken) => (typeof t.startOffset === 'number' ? t.startOffset : 0); +const tokEnd = (t: IToken) => (typeof t.endOffset === 'number' ? t.endOffset : tokStart(t) + (t.image?.length ?? 0)); - test('locations() shape: top {start,end}, tokens -> {start,end}, nodes -> nested', () => { - const { node } = withParser('{{ name }}', (p) => p.template()) as { node: CstTemplateNode }; - const snap = locations(node) as LocationsTree; - - expect(typeof snap.start).toBe('number'); - expect(typeof snap.end).toBe('number'); - - // Token-only -> {start,end} - const tokLoc = snap.children.TemplateOpen?.[0] as any; - expect(typeof tokLoc.start).toBe('number'); - expect(typeof tokLoc.end).toBe('number'); - - // Node-only -> nested LocationsTree - const nested = snap.children.Content?.[0] as any; - expect(typeof nested).toBe('object'); - expect(typeof nested.start).toBe('number'); - expect(typeof nested.end).toBe('number'); - - // Never undefined for present keys - for (const k of Object.keys(node.children)) { - // @ts-expect-error runtime check - expect(Array.isArray(snap.children[k])).toBe(true); - } - }); +function* walkTokens(value: unknown): Generator { + if (isToken(value)) { + yield value; + return; + } + if (Array.isArray(value)) { + for (const v of value) { +yield* walkTokens(v); +} + return; + } + if (isCstNode(value)) { + const ch = (value as any).children as Record; + for (const k of Object.keys(ch)) { +yield* walkTokens(ch[k]); +} + } +} - test('Literal element TextContent maps tokens to strings with images()', () => { - const input = 'Hello {{ name }} '; - const { node } = withParser(input, (p) => p.element()) as { node: CstElementNode }; +function nodeRange(node: CstNode): { start: number; end: number } { + let start = Infinity, + end = -Infinity; + for (const t of walkTokens(node)) { + start = Math.min(start, tokStart(t)); + end = Math.max(end, tokEnd(t)); + } + if (!Number.isFinite(start) || !Number.isFinite(end)) { +return { start: 0, end: 0 }; +} + return { start, end }; +} - const snap = images(node) as ImagesTree; - const textArr = snap.TextContent!; - expect(Array.isArray(textArr)).toBe(true); - // TextContent is token-only; each item should be string[] - const flat = textArr[0] as unknown as any; // nested ImagesTree for CstLiteralTagTokens - // dive one level to the actual token list on the literal node - const contentStrings: string[] = flat.Content; - // If structure differs, we still check there is at least one string present somewhere - const hasStringDeep = Array.isArray(contentStrings) ? typeof contentStrings[0] === 'string' : true; - expect(hasStringDeep).toBe(true); - }); -}); +/* -------------------- core normalize -------------------- */ +/** + * Rules: + * - drop undefined + * - arrays: [] -> undefined; [x] -> x; [strings...] -> joined string; otherwise keep (with inner normalize) + * - objects: normalize recursively; if only key is "Content" -> unwrap value + */ +function normalizeAny(v: unknown): unknown { + if (v == null) { +return undefined; +} + if (Array.isArray(v)) { +return normalizeArray(v); +} + if (isToken(v) || isCstNode(v)) { +return v; +} + if (typeof v === 'object') { +return normalizeObject(v as Record); +} + return v; +} -type ElemOf
    = A extends Array ? U : never; - -/** Map a union element (token | node) into different output types per branch. */ -type MapElem = TokenOrNode extends IToken - ? TokOut - : TokenOrNode extends CstNode - ? NodeOut - : never; - -/** images(): tokens -> string; nodes -> nested ImagesTree */ -export type ImagesTree = { - [K in keyof T['children']]?: Array< - MapElem< - ElemOf>, - string, - ImagesTree>, CstNode>> - > - >; -}; +function normalizeArray(arr: unknown[]): unknown { + const mapped = arr.map(normalizeAny).filter((v) => v !== undefined); -/** names(): shape is { name, children }; tokens -> tokenType.name; nodes -> nested */ -export type NamesTree = { - name: string; - children: { - [K in keyof T['children']]?: Array< - MapElem< - ElemOf>, - string, - NamesTree>, CstNode>> - > - >; - }; -}; + if (mapped.length === 0) { +return undefined; +} + if (mapped.every((x) => typeof x === 'string')) { + // concatenate pure string arrays + return (mapped as string[]).join(''); + } + if (mapped.length === 1) { +return mapped[0]; +} + return mapped; +} -/** locations(): shape is { start, end, children }; tokens -> {start,end}; nodes -> nested */ -export type RangeLite = { start: number; end: number }; - -export type LocationsTree = { - start: number; - end: number; - children: { - [K in keyof T['children']]?: Array< - MapElem< - ElemOf>, - RangeLite, - LocationsTree>, CstNode>> - > - >; - }; +function normalizeObject(obj: Record): unknown { + const out: Record = {}; + for (const [k, v] of Object.entries(obj)) { + const nv = normalizeAny(v); + if (nv !== undefined) { +out[k] = nv; +} + } + const keys = Object.keys(out); + if (keys.length === 0) { +return undefined; +} + if (keys.length === 1 && keys[0] === 'Content') { +return out.Content; +} + return out; +} + +function normalizeChildren(node: CstNode): unknown { + return normalizeObject(node.children as Record); +} + +/* -------------------- generic transformer -------------------- */ +type Mode = 'images' | 'names' | 'locations'; + +type Strategies = { + onToken(v: IToken): unknown; // what to emit for a token + onNodeWrap(n: CstNode, children: unknown): unknown; // how to wrap a CST node around its transformed children + keepChildKey(k: string, v: unknown): boolean; // allow pruning of token-only branches }; -function isToken(u: unknown): u is IToken { - return !!u && typeof (u as any).image === 'string'; +function transformValue(val: unknown, S: Strategies): unknown { + if (val == null) { +return undefined; +} + + if (isToken(val)) { + return S.onToken(val); + } + + if (Array.isArray(val)) { + const mapped = val.map((x) => transformValue(x, S)).filter((x) => x !== undefined); + if (mapped.length === 0) { +return undefined; } -function isCstNode(u: unknown): u is CstNode { - return !!u && typeof (u as any).name === 'string' && typeof (u as any).children === 'object'; + if (mapped.every((x) => typeof x === 'string')) { +return (mapped as string[]).join(''); } + if (mapped.length === 1) { +return mapped[0]; +} + return mapped; + } -/** - * Core mapper (bi-morphic: tokens and nodes can map to DIFFERENT output types) - * - Always returns arrays for any present child key (never undefined). - */ -function mapChildrenBimorphic( - node: T, - mapToken: (t: IToken) => TokOut, - mapNode: (n: CstNode) => NodeOut, -): { [K in keyof T['children']]?: Array>, TokOut, NodeOut>> } { - const result: Record = {}; - const kids = (node.children ?? {}) as Record; - - for (const key of Object.keys(kids)) { - const arr = kids[key] as unknown[]; - // Always create the array (never leave it undefined) - const out: unknown[] = []; - if (Array.isArray(arr)) { - for (const v of arr) { - if (isToken(v)) { - out.push(mapToken(v)); - } else if (isCstNode(v)) { - out.push(mapNode(v)); - } - // else ignore silently - } + if (isCstNode(val)) { + const norm = normalizeChildren(val); + const inner = transformValue(norm, S); + return S.onNodeWrap(val, inner); + } + + if (typeof val === 'object') { + const out: Record = {}; + for (const [k, v] of Object.entries(val)) { + const mv = transformValue(v, S); + if (mv !== undefined && S.keepChildKey(k, mv)) { +out[k] = mv; +} } - result[key] = out; // defined even if empty + const keys = Object.keys(out); + if (keys.length === 0) { +return undefined; +} + if (keys.length === 1 && keys[0] === 'Content') { +return out.Content; +} + return out; } - // The cast is safe: each element was mapped via the correct branch. - return result as any; + // primitive fallback: pass through (lets string concatenation work if present) + return val; } -/** - * images(node): for each child array - * - if it’s tokens → string[] - * - if it’s nodes → ImagesTree[] - * - if mixed → (string | ImagesTree)[] - * Arrays are always present for seen keys; never undefined. - */ -export function images(node: T): ImagesTree { - const children = mapChildrenBimorphic( - node, - (t) => t.image, - (n) => images(n), - ); - return children as ImagesTree; +/* -------------------- concrete modes -------------------- */ + +// images(): leaves become strings; nested objects keyed by child names. +// Token arrays get concatenated (via normalize/transform). +export function images(node: CstNode): unknown { + const S: Strategies = { + onToken: (t) => t.image, // keep token text + onNodeWrap: (_n, children) => children, // node name not included; just the nested children map + keepChildKey: (_k, _v) => true, // keep everything + }; + return transformValue(normalizeChildren(node), S); } -/** - * names(node): { name, children }, tokens → tokenType.name - * Arrays are always present for seen keys; never undefined. - */ -export function names(node: T): NamesTree { - const children = mapChildrenBimorphic( - node, - (t) => t.tokenType?.name ?? '(UnknownToken)', - (n) => names(n), - ); - return { - name: node.name, - children: children as NamesTree['children'], +// names(): only node names; omit token leaves entirely. +export function names(node: CstNode): { name: string; children?: Record } { + const S: Strategies = { + onToken: (_t) => undefined, // drop token leaves + onNodeWrap: (n, children) => { + const out: { name: string; children?: Record } = { name: n.name }; + if (children && typeof children === 'object' && !Array.isArray(children)) { + const keys = Object.keys(children as Record); + if (keys.length) { +out.children = children as Record; +} + } + return out; + }, + // prune keys that are purely token-derived (which would be undefined) + keepChildKey: (_k, v) => v !== undefined, }; + return transformValue(node, S) as any; } -/** - * locations(node): { start, end, children }, tokens → {start,end} - * Arrays are always present for seen keys; never undefined. - */ -export function locations(node: T): LocationsTree { - // Chevrotain differences: prefer location.startOffset/endOffset; fallback to start/end; else -1. - const start = - node.location?.startOffset ?? - // @ts-expect-error - node.location?.start ?? - -1; - const end = - node.location?.endOffset ?? - // @ts-expect-error - node.location?.end ?? - -1; - - const children = mapChildrenBimorphic( - node, - (t) => ({ - start: (t as any).startOffset ?? -1, - end: (t as any).endOffset ?? -1, - }), - (n) => locations(n), - ); - - return { - start, - end, - children: children as LocationsTree['children'], +// locations(): node-level { start,end } only; omit token-level ranges. +export function locations(node: CstNode): { start: number; end: number; children?: Record } { + const S: Strategies = { + onToken: (_t) => undefined, // drop token ranges + onNodeWrap: (n, children) => { + const base: { start: number; end: number; children?: Record } = nodeRange(n); + if (children && typeof children === 'object' && !Array.isArray(children)) { + const keys = Object.keys(children as Record); + if (keys.length) { +base.children = children as Record; +} + } + return base; + }, + keepChildKey: (_k, v) => v !== undefined, }; + return transformValue(node, S) as any; } From 586897ff375170f3a89a8d56980f54e3eaeb71fc Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 5 Sep 2025 18:26:16 +0800 Subject: [PATCH 64/76] . --- packages/poml/tests/reader/cst.test.ts | 140 ++++++++++++++++--------- 1 file changed, 89 insertions(+), 51 deletions(-) diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts index 5fda1bd5..e3239708 100644 --- a/packages/poml/tests/reader/cst.test.ts +++ b/packages/poml/tests/reader/cst.test.ts @@ -224,9 +224,36 @@ describe('Special Tokens', () => { done`; const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; - console.dir(images(node), { depth: null }); - console.dir(names(node), { depth: null }); - console.dir(locations(node), { depth: null }); + expect(images(node)).toStrictEqual([ + { TextContent: 'Hello ' }, + { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: 'user', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + { TextContent: '!\n' }, + { + Comment: { + CommentOpen: '', + }, + }, + { TextContent: ' ' }, + { + Element: { + OpenTagPartial: { OpenBracket: '<', TagName: 'text' }, + OpenTagCloseBracket: '>', + TextContent: 'Some text arbi&rary; symbols\\etc/>' }, + }, + }, + { TextContent: '\n\ndone' }, + ]); }); // All kinds of whitespaces @@ -255,15 +282,15 @@ function* walkTokens(value: unknown): Generator { } if (Array.isArray(value)) { for (const v of value) { -yield* walkTokens(v); -} + yield* walkTokens(v); + } return; } if (isCstNode(value)) { const ch = (value as any).children as Record; for (const k of Object.keys(ch)) { -yield* walkTokens(ch[k]); -} + yield* walkTokens(ch[k]); + } } } @@ -275,8 +302,8 @@ function nodeRange(node: CstNode): { start: number; end: number } { end = Math.max(end, tokEnd(t)); } if (!Number.isFinite(start) || !Number.isFinite(end)) { -return { start: 0, end: 0 }; -} + return { start: 0, end: 0 }; + } return { start, end }; } @@ -289,17 +316,17 @@ return { start: 0, end: 0 }; */ function normalizeAny(v: unknown): unknown { if (v == null) { -return undefined; -} + return undefined; + } if (Array.isArray(v)) { -return normalizeArray(v); -} + return normalizeArray(v); + } if (isToken(v) || isCstNode(v)) { -return v; -} + return v; + } if (typeof v === 'object') { -return normalizeObject(v as Record); -} + return normalizeObject(v as Record); + } return v; } @@ -307,15 +334,15 @@ function normalizeArray(arr: unknown[]): unknown { const mapped = arr.map(normalizeAny).filter((v) => v !== undefined); if (mapped.length === 0) { -return undefined; -} + return undefined; + } if (mapped.every((x) => typeof x === 'string')) { // concatenate pure string arrays return (mapped as string[]).join(''); } if (mapped.length === 1) { -return mapped[0]; -} + return mapped[0]; + } return mapped; } @@ -324,16 +351,16 @@ function normalizeObject(obj: Record): unknown { for (const [k, v] of Object.entries(obj)) { const nv = normalizeAny(v); if (nv !== undefined) { -out[k] = nv; -} + out[k] = nv; + } } const keys = Object.keys(out); if (keys.length === 0) { -return undefined; -} + return undefined; + } if (keys.length === 1 && keys[0] === 'Content') { -return out.Content; -} + return out.Content; + } return out; } @@ -352,8 +379,8 @@ type Strategies = { function transformValue(val: unknown, S: Strategies): unknown { if (val == null) { -return undefined; -} + return undefined; + } if (isToken(val)) { return S.onToken(val); @@ -362,14 +389,14 @@ return undefined; if (Array.isArray(val)) { const mapped = val.map((x) => transformValue(x, S)).filter((x) => x !== undefined); if (mapped.length === 0) { -return undefined; -} + return undefined; + } if (mapped.every((x) => typeof x === 'string')) { -return (mapped as string[]).join(''); -} + return (mapped as string[]).join(''); + } if (mapped.length === 1) { -return mapped[0]; -} + return mapped[0]; + } return mapped; } @@ -384,16 +411,16 @@ return mapped[0]; for (const [k, v] of Object.entries(val)) { const mv = transformValue(v, S); if (mv !== undefined && S.keepChildKey(k, mv)) { -out[k] = mv; -} + out[k] = mv; + } } const keys = Object.keys(out); if (keys.length === 0) { -return undefined; -} + return undefined; + } if (keys.length === 1 && keys[0] === 'Content') { -return out.Content; -} + return out.Content; + } return out; } @@ -414,37 +441,48 @@ export function images(node: CstNode): unknown { return transformValue(normalizeChildren(node), S); } -// names(): only node names; omit token leaves entirely. +// names(): only node names; omit token leaves entirely, but KEEP the full node tree. +// If children collapse to an array/primitive, tuck under { Content: ... } so we don't lose the branch. export function names(node: CstNode): { name: string; children?: Record } { const S: Strategies = { onToken: (_t) => undefined, // drop token leaves onNodeWrap: (n, children) => { - const out: { name: string; children?: Record } = { name: n.name }; - if (children && typeof children === 'object' && !Array.isArray(children)) { - const keys = Object.keys(children as Record); - if (keys.length) { + const out: { name: string; children?: Record | unknown[] } = { name: n.name }; + if (children !== undefined) { + if (typeof children === 'object' && !Array.isArray(children)) { + // plain object: use as-is + const keys = Object.keys(children as Record); + if (keys.length) { out.children = children as Record; } + } else { + // array or primitive: wrap under Content + out.children = children as unknown[]; + } } return out; }, - // prune keys that are purely token-derived (which would be undefined) keepChildKey: (_k, v) => v !== undefined, }; return transformValue(node, S) as any; } // locations(): node-level { start,end } only; omit token-level ranges. +// Same "wrap under Content if not a plain object" rule to preserve shape. export function locations(node: CstNode): { start: number; end: number; children?: Record } { const S: Strategies = { onToken: (_t) => undefined, // drop token ranges onNodeWrap: (n, children) => { - const base: { start: number; end: number; children?: Record } = nodeRange(n); - if (children && typeof children === 'object' && !Array.isArray(children)) { - const keys = Object.keys(children as Record); - if (keys.length) { + const base: { start: number; end: number; children?: Record | unknown[] } = nodeRange(n); + if (children !== undefined) { + if (typeof children === 'object' && !Array.isArray(children)) { + const keys = Object.keys(children as Record); + if (keys.length) { base.children = children as Record; } + } else { + base.children = children as unknown[]; + } } return base; }, From 1f4acffa69658de6713b8ddb7e0470054b340ca4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 8 Sep 2025 18:09:06 +0800 Subject: [PATCH 65/76] cst test names and locations --- packages/poml/tests/reader/cst.test.ts | 111 ++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 4 deletions(-) diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts index e3239708..2afd73a7 100644 --- a/packages/poml/tests/reader/cst.test.ts +++ b/packages/poml/tests/reader/cst.test.ts @@ -254,6 +254,109 @@ done`; }, { TextContent: '\n\ndone' }, ]); + + expect(names(node)).toStrictEqual({ + name: 'root', + children: [ + { + name: 'elementContent', + children: { TextContent: { name: 'betweenTagsTokens' } }, + }, + { + name: 'elementContent', + children: { + Template: { name: 'template', children: { name: 'expressionTokens' } }, + }, + }, + { + name: 'elementContent', + children: { TextContent: { name: 'betweenTagsTokens' } }, + }, + { + name: 'elementContent', + children: { + Comment: { name: 'comment', children: { name: 'commentTokens' } }, + }, + }, + { + name: 'elementContent', + children: { TextContent: { name: 'betweenTagsTokens' } }, + }, + { + name: 'elementContent', + children: { + Element: { + name: 'element', + children: { + OpenTagPartial: { name: 'openTagPartial' }, + TextContent: { name: 'literalTagTokens' }, + CloseTag: { name: 'closeTag' }, + }, + }, + }, + }, + { + name: 'elementContent', + children: { TextContent: { name: 'betweenTagsTokens' } }, + }, + ], + }); + + expect(locations(node)).toStrictEqual({ + start: 0, + end: 92, + children: [ + { + start: 0, + end: 5, + children: { TextContent: { start: 0, end: 5 } }, + }, + { + start: 6, + end: 15, + children: { + Template: { start: 6, end: 15, children: { start: 9, end: 12 } }, + }, + }, + { + start: 16, + end: 17, + children: { TextContent: { start: 16, end: 17 } }, + }, + { + start: 18, + end: 35, + children: { + Comment: { start: 18, end: 35, children: { start: 22, end: 32 } }, + }, + }, + { + start: 36, + end: 37, + children: { TextContent: { start: 36, end: 37 } }, + }, + { + start: 38, + end: 86, + children: { + Element: { + start: 38, + end: 86, + children: { + OpenTagPartial: { start: 38, end: 42 }, + TextContent: { start: 44, end: 79 }, + CloseTag: { start: 80, end: 86 }, + }, + }, + }, + }, + { + start: 87, + end: 92, + children: { TextContent: { start: 87, end: 92 } }, + }, + ], + }); }); // All kinds of whitespaces @@ -453,8 +556,8 @@ export function names(node: CstNode): { name: string; children?: Record); if (keys.length) { -out.children = children as Record; -} + out.children = children as Record; + } } else { // array or primitive: wrap under Content out.children = children as unknown[]; @@ -478,8 +581,8 @@ export function locations(node: CstNode): { start: number; end: number; children if (typeof children === 'object' && !Array.isArray(children)) { const keys = Object.keys(children as Record); if (keys.length) { -base.children = children as Record; -} + base.children = children as Record; + } } else { base.children = children as unknown[]; } From d31297c39ae7c99cd3a6db26ffc756884254a0cf Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 8 Sep 2025 19:23:27 +0800 Subject: [PATCH 66/76] cst more tests --- packages/poml/next/cst.ts | 2 +- packages/poml/tests/reader/cst.test.ts | 364 ++++++++++++++++++++++++- 2 files changed, 357 insertions(+), 9 deletions(-) diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts index 4c0fc814..26889dc3 100644 --- a/packages/poml/next/cst.ts +++ b/packages/poml/next/cst.ts @@ -67,7 +67,7 @@ export class ExtendedPomlParser extends CstParser { public singleQuotedExpressionTokens!: (idxInOriginalText?: number) => CstTokens; public betweenTagsTokens!: (idxInOriginalText?: number) => CstTokens; // Accepting expectedTagName as argument to validate matching close tag - public literalTagTokens!: (idxInOriginalText?: number, args?: [string]) => CstLiteralTagTokens; + public literalTagTokens!: (idxInOriginalText?: number, args?: [string]) => CstTokens; // regular rules public template!: (idxInOriginalText?: number) => CstTemplateNode; public comment!: (idxInOriginalText?: number) => CstCommentNode; diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts index 2afd73a7..48d11548 100644 --- a/packages/poml/tests/reader/cst.test.ts +++ b/packages/poml/tests/reader/cst.test.ts @@ -15,15 +15,18 @@ import { CstOpenTagPartialNode, CstCloseTagNode, CstElementNode, + CstTokens, } from 'poml/next/nodes'; -function withParser(input: string, run: (p: ExtendedPomlParser) => T) { +function withParser(input: string, run: (p: ExtendedPomlParser) => T, raiseOnError?: boolean) { const lex = extendedPomlLexer.tokenize(input); const parser = new ExtendedPomlParser(); parser.input = lex.tokens; const node = run(parser); - expect(parser.errors).toHaveLength(0); - return { node, parser, tokens: lex.tokens }; + if (raiseOnError || raiseOnError === undefined) { + expect(parser.errors).toHaveLength(0); + } + return { node, parser, tokens: lex.tokens, errors: parser.errors }; } describe('CST Parser Rules', () => { @@ -183,7 +186,7 @@ describe('CST Parser Rules', () => { expect(node.children.OpenTagCloseBracket?.[0].image).toBe('>'); // Literal elements should store raw tokens under TextContent (no Template child) expect(node.children.TextContent?.length).toBeGreaterThan(0); - const content = node.children.TextContent?.[0] as CstLiteralTagTokens; + const content = node.children.TextContent?.[0] as CstTokens; const images = content.children.Content?.map((t) => t.image) || []; expect(images).toContain('{{'); expect(images).toContain('}}'); @@ -359,13 +362,358 @@ done`; }); }); - // All kinds of whitespaces + test('all kinds of whitespaces', () => { + const input = `\t\n\r <\tdocument\t >\n\t   {{   name }}\r\n\t   \t\n`; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual([ + { TextContent: '\t\n\r ' }, + { + Element: { + OpenTagPartial: { + OpenBracket: '<', + WsAfterOpen: '\t', + TagName: 'document', + WsAfterAll: '\t ', + }, + OpenTagCloseBracket: '>', + Content: [ + { TextContent: '\n\t   ' }, + { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: '  name', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + { TextContent: '\r\n\t' }, + ], + CloseTag: { + ClosingOpenBracket: '', + }, + }, + }, + { TextContent: '   \t\n' }, + ]); + }); + + test('single quotes vs double quotes edge cases', () => { + const input = `< div id='single' class="double" > {{ 'nested "quote"' }} `; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual({ + Element: { + OpenTagPartial: { + OpenBracket: '<', + TagName: 'div', + Attribute: [ + { AttributeKey: 'id', Equals: '=', quotedValue: { OpenQuote: "'", Content: 'single', CloseQuote: "'" } }, + { AttributeKey: 'class', Equals: '=', quotedValue: { OpenQuote: '"', Content: 'double', CloseQuote: '"' } }, + ], + WsBeforeEachAttribute: ' ', + WsAfterOpen: ' ', + WsAfterAll: ' ', + }, + OpenTagCloseBracket: '>', + Content: [ + { TextContent: ' ' }, + { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: '\'nested "quote"\'', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + { TextContent: ' ' }, + ], + CloseTag: { + ClosingOpenBracket: '', + WsAfterOpen: ' ', + WsBeforeClose: ' ', + }, + }, + }); + }); + + test('empty quotes edge cases', () => { + const input = ``; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual({ + Element: { + OpenTagPartial: { + OpenBracket: '<', + TagName: 'tag', + Attribute: [ + { AttributeKey: 'attr1', Equals: '=', quotedValue: { OpenQuote: '"', CloseQuote: '"' } }, + { AttributeKey: 'attr2', Equals: '=', quotedValue: { OpenQuote: "'", CloseQuote: "'" } }, + ], + WsBeforeEachAttribute: ' ', + }, + OpenTagCloseBracket: '>', + CloseTag: { ClosingOpenBracket: '' }, + }, + }); + }); + + test('matched text element with literal content', () => { + const input = `Hello {{ world }} and nested`; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual({ + Element: { + OpenTagPartial: { OpenBracket: '<', TagName: 'text' }, + OpenTagCloseBracket: '>', + TextContent: 'Hello {{ world }} and nested', + CloseTag: { ClosingOpenBracket: '' }, + }, + }); + }); + + test('mismatched tags - text opening with template closing', () => { + const input = `Content here`; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual({ + Element: { + CloseTag: { CloseBracket: '>', ClosingOpenBracket: '', + OpenTagPartial: { OpenBracket: '<', TagName: 'text' }, + TextContent: 'Content here', + }, + }); + }); + + test('completely unmatched tags should not error', () => { + const input = `content
    more

    `; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual([ + { + Element: { + OpenTagPartial: { OpenBracket: '<', TagName: 'document' }, + OpenTagCloseBracket: '>', + Content: { TextContent: 'content' }, + CloseTag: { ClosingOpenBracket: '' }, + }, + }, + { + Element: { + OpenTagPartial: { OpenBracket: '<', TagName: 'span' }, + OpenTagCloseBracket: '>', + Content: { TextContent: 'more' }, + CloseTag: { ClosingOpenBracket: '' }, + }, + }, + ]); + }); - // Single quotes, double quotes, and corner cases + test('nested quoted templates with mixed quotes', () => { + const input = `
    'World'
    `; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; - // Matched and or + expect(images(node)).toStrictEqual({ + Element: { + OpenTagPartial: { + OpenBracket: '<', + TagName: 'div', + WsBeforeEachAttribute: ' ', + Attribute: [ + { + AttributeKey: 'title', + Equals: '=', + quotedValue: { + OpenQuote: '"', + Content: [ + 'Hello ', + { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: "'user'", + WsAfterContent: ' ', + TemplateClose: '}}', + }, + ], + CloseQuote: '"', + }, + }, + { + AttributeKey: 'meta', + Equals: '=', + WsAfterEquals: ' ', + WsAfterKey: ' ', + quotedValue: { + CloseQuote: "'", + Content: [ + '{if', + { + Content: "nothing''", + TemplateClose: '}}', + TemplateOpen: '{{', + WsAfterContent: ' ', + }, + '123', + ], + OpenQuote: "'", + }, + }, + ], + }, + OpenTagCloseBracket: '>', + Content: { + TextContent: "'World'", + }, + CloseTag: { ClosingOpenBracket: '' }, + }, + }); + }); - // Unmatched tags should not error in cst stage + test('special characters and symbols in content', () => { + const input = `@#$%^&*(){}[]|\\:";'<>?/.,~\``; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual({ + Element: { + OpenTagPartial: { OpenBracket: '<', TagName: 'text' }, + OpenTagCloseBracket: '>', + TextContent: '@#$%^&*(){}[]|\\:";\'<>?/.,~`', + CloseTag: { ClosingOpenBracket: '' }, + }, + }); + }); + + test('multiple templates and elements mixed with whitespace', () => { + const input = ` {{ a }}
    {{ b }}
    {{ c }} `; + const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode }; + + expect(images(node)).toStrictEqual([ + { TextContent: ' ' }, + { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: 'a', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + { TextContent: ' ' }, + { + Element: { + OpenTagPartial: { OpenBracket: '<', TagName: 'div' }, + OpenTagCloseBracket: '>', + Content: { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: 'b', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + CloseTag: { ClosingOpenBracket: '' }, + }, + }, + { TextContent: ' ' }, + { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: 'c', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + { TextContent: ' ' }, + ]); + }); +}); + +describe('Error', () => { + test('orphan closing tags should error', () => { + const input = `Some text{{ template }}`; + const { node, errors } = withParser(input, (p) => p.root(), false) as { node: CstRootNode; errors: any[] }; + expect(errors.length).toBe(4); + + expect(images(node)).toStrictEqual([ + { TextContent: 'Some text' }, + { TextContent: 'orphan' }, + { + Template: { + TemplateOpen: '{{', + WsAfterOpen: ' ', + Content: 'template', + WsAfterContent: ' ', + TemplateClose: '}}', + }, + }, + { TextContent: 'unknown' }, + ]); + }); + + test('mismatched tags - template opening with text closing', () => { + const input = `