diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 00000000..02e05248
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,9 @@
+{
+ "permissions": {
+ "allow": [
+ "Bash(npm run lint)",
+ "Bash(npm run test*)",
+ "Read(~/.zshrc)"
+ ]
+ }
+}
\ No newline at end of file
diff --git a/docs/deep-dive/proposals/poml_extended.md b/docs/deep-dive/proposals/poml_extended.md
index 0cd9f0d8..ca650936 100644
--- a/docs/deep-dive/proposals/poml_extended.md
+++ b/docs/deep-dive/proposals/poml_extended.md
@@ -19,7 +19,7 @@ The current POML implementation requires files to be fully enclosed within `<poml>` tags
+4. **Controlled Evolution of Tags**: behaviour of new/experimental tags is opt‑in via `<meta>`, preventing accidental breakage when upgrading the tool‑chain.
## File Format Specification
@@ -38,6 +38,7 @@ The system will assume the whole file is a pure text file and detects certain pa
1. Loading component definitions from `componentDocs.json` and extracting valid POML component names and their aliases.
2. Scanning for opening tags that match these components, and scanning until the corresponding closing tag is found.
3. If a special tag `<text>...</text>` is found within a POML segment, it will be treated as pure text content and processed following the rules above (steps 1 and 2).
+4. Unknown or disabled tags are treated as literal text and, by default, raise a diagnostic warning.
An example is shown below:
@@ -104,81 +105,124 @@ There can be some intervening text here as well.
Metadata is information that is useful when parsing and rendering the file, such as context variables, stylesheets, version information, and file paths.
File-level metadata can be included anywhere in the file in a special `<meta>` tag. This metadata is processed before any content parsing.
+By default, a metadata tag has no child content. When child content exists, the `<meta>` tag must carry a `type` attribute that specifies what kind of content is provided.
+
+**Example:**
+
+```xml
+<!-- by default, metadata has no child content -->
+<meta minimalPomlVersion="0.5" />
+
+<!-- metadata with child content must declare a type -->
+<meta type="stylesheet">
+{ "foo": "bar" }
+</meta>
+```
## Architecture Design
### High-level Processing Pipeline
-The core of the new architecture is a three-pass process: Segmentation, Metadata Extraction, and Recursive Rendering.
+The core of the new architecture is a three-pass process: Tokenization, AST Parsing, and Recursive Rendering.
-#### I. Segmentation Pass
+#### I. Tokenization and AST Parsing
-This initial pass is a crucial preprocessing step that scans the raw file content and partitions it into a hierarchical tree of segments. It does **not** parse the full XML structure of POML blocks; it only identifies their boundaries.
+This phase processes the raw file content into an Abstract Syntax Tree (AST). It leverages the provided ExtendedPomlLexer.
-- **Objective**: To classify every part of the file as `META`, `POML`, or `TEXT` and build a nested structure.
-- **Algorithm**:
- 1. Load all valid POML component tag names (including aliases) from `componentDocs.json`. This set of tags will be used for detection.
- 2. Initialize the root of the segment tree as a single, top-level `TEXT` segment spanning the entire file, unless the root segment is a single `<poml>...</poml>` block spanning the whole file (in which case it will be treated as a `POML` segment).
- 3. Use a stack-based algorithm to scan the text.
- When an opening tag (e.g., `<task>`) that matches a known POML component is found, push its name and start position onto the stack. This marks the beginning of a potential `POML` segment.
- When a closing tag (e.g., `</task>`) is found that matches the tag at the top of the stack, pop the stack. This marks a complete `POML` segment. This new segment is added as a child to the current parent segment in the tree.
- The special `<text>` tag is handled recursively. If a `<text>` tag is found _inside_ a `POML` segment, the scanner will treat its content as a nested `TEXT` segment. This `TEXT` segment can, in turn, contain more `POML` children.
- Any content not enclosed within identified `POML` tags remains part of its parent `TEXT` segment.
- 4. `<meta>` tags are treated specially. They are identified and parsed into `META` segments at any level but are logically hoisted and processed first. They should not have children.
+- **Tokenization**: The ExtendedPomlLexer (using chevrotain) scans the entire input string and breaks it into a flat stream of tokens (TagOpen, Identifier, TextContent, TemplateOpen, etc.). This single lexing pass is sufficient for the entire mixed-content file. The distinction between "text" and "POML" is not made at this stage; it's simply a stream of tokens.
+- **AST Parsing Algorithm**: A CST (Concrete Syntax Tree) or AST parser will consume the token stream from the lexer. The parser is stateful, using a `PomlContext` object to track parsing configurations.
+ 1. The parser starts in "text mode". It consumes TextContent, TemplateOpen/TemplateClose, and other non-tag tokens, bundling them into TEXT or TEMPLATE nodes.
+ 2. When a TagOpen (`<`) token is followed by the Identifier "meta", a META node is created. Its attributes are immediately parsed to populate the `PomlContext`. This allows metadata to control the parsing of the remainder of the file (e.g., by enabling new tags). The META node is added to the AST but will be ignored during rendering.
+ 3. When a TagOpen (`<`) token is followed by an Identifier that matches a known POML component (from componentDocs.json and enabled via PomlContext), the parser switches to "POML mode" and creates a POML node.
+ 4. In "POML mode," it parses attributes (Identifier, Equals, DoubleQuote/SingleQuote), nested tags, and content until it finds a matching TagClosingOpen (`<`) token. Template variables `{{}}` within attribute values or content are parsed into child TEMPLATE nodes.
+ 5. If the tag is ``, it creates a POML node for `` itself, but its _children_ are parsed by recursively applying the "text mode" logic (step 1), allowing for nested POML within ``.
+ 6. If a TagOpen is followed by an Identifier that is _not_ a known POML component, the parser treats these tokens (`<`, tagname, `>`) as literal text and reverts to "text mode".
+ 7. The parser closes the current POML node when the corresponding TagClosingOpen (`<`) and Identifier are found. After closing the top-level POML tag, it reverts to "text mode".
-- **Output**: A `Segment` tree. For backward compatibility, if the root segment is a single `<poml>...</poml>` block spanning the whole file, the system can revert to the original, simpler parsing model.
+- **Error Tolerance**: The parser will be designed to be error-tolerant. If a closing tag is missing, it can infer closure at the end of the file or when a new top-level tag begins, logging a diagnostic warning.
-**`Segment` Interface**: The `children` property is key to representing the nested structure of mixed-content files.
+- **Source Mapping**: The chevrotain tokens inherently contain offset, line, and column information. This data is directly transferred to the ASTNode during parsing, enabling robust code intelligence features.
+
+- **Output**: An AST representing the hierarchical structure of the document, where each node contains source position information and type metadata.
+
+**`ASTNode` Interface**: The AST nodes represent the parsed structure with source mapping.
```typescript
-interface Segment {
- id: string; // Unique ID for caching and React keys
- kind: 'META' | 'TEXT' | 'POML';
+interface SourceRange {
start: number;
end: number;
- content: string; // The raw string content of the segment
- parent?: Segment; // Reference to the parent segment
- children: Segment[]; // Nested segments (e.g., a POML block within text)
- tagName?: string; // For POML segments, the name of the root tag (e.g., 'task')
}
-```
-#### II. Metadata Processing
-
-Once the segment tree is built, all `META` segments are processed.
+interface AttributeInfo {
+ key: string;
+ value: (ASTNode & { kind: 'TEXT' | 'TEMPLATE' })[]; // Mixed content: array of text/template nodes
+ keyRange: SourceRange; // Position of attribute name
+ valueRange: SourceRange; // Position of attribute value (excluding quotes)
+ fullRange: SourceRange; // Full attribute including key="value"
+}
-- **Extraction**: Traverse the tree to find all `META` segments.
-- **Population**: Parse the content of each `<meta>` tag and populate the global `PomlContext` object.
-- **Removal**: After processing, `META` segments are removed from the tree to prevent them from being rendered.
+interface ASTNode {
+ id: string; // Unique ID for caching and React keys
+ kind: 'META' | 'TEXT' | 'POML' | 'TEMPLATE';
+ start: number; // Source position start of entire node
+ end: number; // Source position end of entire node
+ content: string; // The raw string content
+ parent?: ASTNode; // Reference to the parent node
+ children: ASTNode[]; // Child nodes
+
+ // For POML and META nodes
+ tagName?: string; // Tag name (e.g., 'task', 'meta')
+ attributes?: AttributeInfo[]; // Detailed attribute information
+
+ // Detailed source positions
+ openingTag?: {
+ start: number; // Position of '<'
+ end: number; // Position after '>'
+ nameRange: SourceRange; // Position of tag name
+ };
+
+ closingTag?: {
+ start: number; // Position of '</'
+ end: number; // Position after '>'
+ nameRange: SourceRange; // Position of tag name in closing tag
+ };
+
+ contentRange?: SourceRange; // Position of content between tags (excluding nested tags)
+
+ // For TEXT nodes
+ textSegments?: SourceRange[]; // Multiple ranges for text content (excluding nested POML)
+
+ // For TEMPLATE nodes
+ expression?: string; // The full expression content between {{}}
+}
+```
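+
+For illustration, here is roughly what the AST for a small mixed-content snippet such as `Hello <task>Review {{ code }}</task>` could look like (ids are invented and many optional fields are omitted):
+
+```typescript
+// Hand-written sketch of the expected output shape, not produced by the parser.
+const example: ASTNode = {
+  id: 'text-1',
+  kind: 'TEXT',
+  start: 0,
+  end: 36,
+  content: 'Hello <task>Review {{ code }}</task>',
+  children: [
+    {
+      id: 'poml-1',
+      kind: 'POML',
+      tagName: 'task',
+      start: 6,
+      end: 36,
+      content: '<task>Review {{ code }}</task>',
+      children: [
+        { id: 'tpl-1', kind: 'TEMPLATE', start: 19, end: 29, content: '{{ code }}', expression: 'code', children: [] },
+      ],
+    },
+  ],
+};
+```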
**`PomlContext` Interface**: This context object is the single source of truth for the entire file, passed through all readers. It's mutable, allowing stateful operations like `<let>` to have a file-wide effect.
```typescript
interface PomlContext {
variables: { [key: string]: any }; // For {{ substitutions }} and <let> (Read/Write)
- texts: { [key: string]: React.ReactElement }; // Maps TEXT_ID to content for replacement (Read/Write)
stylesheet: { [key: string]: string }; // Merged styles from all <stylesheet> tags (Read-Only during render)
minimalPomlVersion?: string; // From <meta> (Read-Only)
sourcePath: string; // File path for resolving includes (Read-Only)
}
```
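+
+For illustration (assuming the interface above; the element and variable names below are made up), a `<let>` element and a later `{{ }}` substitution interact only through this shared, mutable object:
+
+```typescript
+// Illustrative sketch only: one PomlContext per file, shared by all readers.
+const context: PomlContext = {
+  variables: {},
+  stylesheet: {},
+  sourcePath: '/prompts/example.poml',
+};
+// Rendering something like <let name="topic" value="prompt engineering" /> writes into it ...
+context.variables['topic'] = 'prompt engineering';
+// ... and a later {{ topic }} substitution reads the same entry.
+const substituted = String(context.variables['topic']); // 'prompt engineering'
+```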
-#### III. Text/POML Dispatching (Recursive Rendering)
+#### II. Text/POML Dispatching (Recursive Rendering)
-Rendering starts at the root of the segment tree and proceeds recursively. A controller dispatches segments to the appropriate reader.
+Rendering starts at the root of the AST and proceeds recursively. A controller dispatches AST nodes to the appropriate reader.
-- **`PureTextReader`**: Handles `TEXT` segments.
+- **`PureTextReader`**: Handles `TEXT` nodes.
- Currently we directly render the pure-text contents as a single React element. In future, we can:
- Renders the text content, potentially using a Markdown processor.
- Performs variable substitutions (`{{...}}`) using the `variables` from `PomlContext`. The logic from `handleText` in the original `PomlFile` should be extracted into a shared utility for this.
- - Iterates through its `children` segments. For each child `POML` segment, it calls the `PomlReader`.
+ - Iterates through its `children` nodes. For each child `POML` node, it calls the `PomlReader`.
-- **`PomlReader`**: Handles `POML` segments.
- - **Pre-processing**: Before parsing, it replaces any direct child `<text>` regions with a self-closing placeholder tag containing a unique ID: `<text ref="TEXT_ID_123" />`. The original content of the `<text>` segment is stored in `context.texts`. This ensures the XML parser inside `PomlFile` doesn't fail on non-XML content (like Markdown).
- - **Delegation**: Instantiates a modified `PomlFile` class with the processed segment content and the shared `PomlContext`.
- - **Rendering**: Calls the `pomlFile.react(context)` method to render the segment.
+- **`PomlReader`**: Handles `POML` nodes.
+ - **Delegation**: Instantiates a modified `PomlFile` class with the processed node content and the shared `PomlContext`.
+ - **Rendering**: Calls the `pomlFile.react(context)` method to render the node.
-- **`IntelliSense Layer`**: The segment tree makes it easy to provide context-aware IntelliSense. By checking the `kind` of the segment at the cursor's offset, the request can be routed to the correct provider—either the `PomlReader`'s XML-aware completion logic or a simpler text/variable completion provider for `TEXT` segments.
+- **`IntelliSense Layer`**: The AST makes it easy to provide context-aware IntelliSense. By checking the `kind` of the node at the cursor's offset, the request can be routed to the correct provider—either the `PomlReader`'s XML-aware completion logic or a simpler text/variable completion provider for `TEXT` nodes.
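+
+A minimal dispatch sketch under these assumptions (the reader classes and their `render(node, context)` signature are illustrative, not the final API):
+
+```typescript
+// Illustrative controller: route each AST node to the matching reader.
+function renderNode(node: ASTNode, context: PomlContext): React.ReactNode {
+  switch (node.kind) {
+    case 'POML':
+      return new PomlReader().render(node, context); // XML-aware rendering via PomlFile
+    case 'TEXT':
+    case 'TEMPLATE':
+      return new PureTextReader().render(node, context); // plain text + {{ }} substitution
+    case 'META':
+    default:
+      return null; // META nodes were consumed during parsing; nothing to render
+  }
+}
+```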
**`Reader` Interface**: This interface defines the contract for both `PureTextReader` and `PomlReader`.
@@ -208,11 +252,3 @@ To achieve this design, the existing `PomlFile` class needs significant refactor
3. **Handling `<include>`**:
- The `handleInclude` method should be **removed** from `PomlFile`. Inclusion is now handled at a higher level by the main processing pipeline. When the `PomlReader` encounters an `<include>` tag, it will invoke the entire pipeline (Segmentation, Metadata, Rendering) on the included file and insert the resulting React elements.
-
-4. **Parsing `TEXT` Placeholders**:
-
-- The core `parseXmlElement` method needs a new branch to handle the `<text ref="..." />` placeholder.
-- When it encounters this element:
- 1. It extracts the `ref` attribute (e.g., `"TEXT_ID_123"`).
- 2. It looks up the corresponding raw text from `context.texts`.
- 3. It fetches from the `context.texts` map and returns a React element containing the pure text content.
diff --git a/package-lock.json b/package-lock.json
index 81ad5efb..9dd58aca 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -23,6 +23,7 @@
"cheerio": "^1.0.0",
"closest-match": "^1.3.3",
"d3-dsv": "~2.0.0",
+ "he": "^1.2.0",
"jquery": "^3.7.1",
"js-tiktoken": "^1.0.20",
"js-yaml": "^4.1.0",
@@ -50,6 +51,7 @@
"@rollup/plugin-json": "^6.1.0",
"@stylistic/eslint-plugin": "^5.2.3",
"@types/d3-dsv": "~2.0.0",
+ "@types/he": "^1.2.3",
"@types/jquery": "^3.5.32",
"@types/js-yaml": "^4.0.9",
"@types/lodash.throttle": "^4.1.9",
@@ -3840,6 +3842,13 @@
"@types/unist": "*"
}
},
+ "node_modules/@types/he": {
+ "version": "1.2.3",
+ "resolved": "https://registry.npmjs.org/@types/he/-/he-1.2.3.tgz",
+ "integrity": "sha512-q67/qwlxblDzEDvzHhVkwc1gzVWxaNxeyHUBF4xElrvjL11O+Ytze+1fGpBHlr/H9myiBUaUXNnNPmBHxxfAcA==",
+ "dev": true,
+ "license": "MIT"
+ },
"node_modules/@types/istanbul-lib-coverage": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.6.tgz",
@@ -8114,7 +8123,6 @@
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz",
"integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
- "dev": true,
"license": "MIT",
"bin": {
"he": "bin/he"
diff --git a/package.json b/package.json
index c86ddc13..0a7a0142 100644
--- a/package.json
+++ b/package.json
@@ -408,6 +408,7 @@
"@rollup/plugin-json": "^6.1.0",
"@stylistic/eslint-plugin": "^5.2.3",
"@types/d3-dsv": "~2.0.0",
+ "@types/he": "^1.2.3",
"@types/jquery": "^3.5.32",
"@types/js-yaml": "^4.0.9",
"@types/lodash.throttle": "^4.1.9",
@@ -460,6 +461,7 @@
"cheerio": "^1.0.0",
"closest-match": "^1.3.3",
"d3-dsv": "~2.0.0",
+ "he": "^1.2.0",
"jquery": "^3.7.1",
"js-tiktoken": "^1.0.20",
"js-yaml": "^4.1.0",
diff --git a/packages/poml/base.tsx b/packages/poml/base.tsx
index ed6efb50..c894c625 100644
--- a/packages/poml/base.tsx
+++ b/packages/poml/base.tsx
@@ -137,7 +137,10 @@ export interface PropsBase {
// Experimental
writerOptions?: object;
- whiteSpace?: 'pre' | 'filter' | 'trim';
+ whiteSpace?: 'pre' | 'filter' | 'trim' | 'collapse';
+
+ // Enforce inline on every element.
+ inline?: boolean;
/** Soft character limit before truncation is applied. */
charLimit?: number;
@@ -996,3 +999,9 @@ export function findComponentByAliasOrUndefined(alias: string, disabled?: Set<string>)
+
+export function listComponentAliases(): string[] {
+ return Object.values(componentRegistry)
+ .map((c) => c.getAliases())
+ .flat();
+}
diff --git a/packages/poml/next/ast.ts b/packages/poml/next/ast.ts
new file mode 100644
index 00000000..367972f7
--- /dev/null
+++ b/packages/poml/next/ast.ts
@@ -0,0 +1,416 @@
+/**
+ * Converting CST nodes to AST nodes.
+ *
+ * It's time for:
+ *
+ * - Check open/close tag matching
+ * - Deal with HTML entities escape and backslash escape
+ * - Concatenate wrongly split text into LiteralNode
+ * - Unify the types (e.g., AttributeNode must have ValueNode children)
+ *
+ * It's not time yet for:
+ *
+ * - Evaluating expressions in templates
+ * - Resolving includes
+ * - Validating semantics (e.g., whether an attribute is allowed on a certain element)
+ */
+
+import { CstNode, IToken } from 'chevrotain';
+import * as he from 'he';
+import {
+ CstRootNode,
+ CstElementContentNode,
+ CstElementNode,
+ CstTemplateNode,
+ CstQuotedNode,
+ CstQuotedTemplateNode,
+ CstForIteratorNode,
+ CstAttributeNode,
+ CstCommentNode,
+ CstPragmaNode,
+ CstTokens,
+ AstNode,
+} from './nodes';
+import {
+ ElementNode,
+ ElementContentNode,
+ ValueNode,
+ TemplateNode,
+ LiteralNode,
+ AttributeNode,
+ ForIteratorNode,
+ CommentNode,
+ PragmaNode,
+ RootNode,
+} from './nodes';
+import { Range } from './types';
+import { extendedPomlParser } from './cst';
+import { BackslashEscape, CharacterEntity } from './lexer';
+import * as diagnostics from './diagnostics';
+
+/** Decode a single backslash escape sequence (for quoted strings). */
+function decodeEscape(seq: string): string {
+ // seq includes the leading backslash (e.g. " , \n)
+ const body = seq.slice(1);
+ if (body === 'n') {
+ return '\n';
+ } else if (body === 'r') {
+ return '\r';
+ } else if (body === 't') {
+ return '\t';
+ } else if (body === "'") {
+ return "'";
+ } else if (body === '"') {
+ return '"';
+ } else if (body === '{{') {
+ // \{{
+ return '{{';
+ } else if (body === '}}') {
+ // \}}
+ return '}}';
+ } else if (body.startsWith('x')) {
+ // \xHH (2 hex digits)
+ const hex = body.slice(1);
+ if (hex.length === 2 && /^[0-9a-fA-F]{2}$/.test(hex)) {
+ const n = parseInt(hex, 16);
+ return String.fromCharCode(n);
+ }
+ return body; // Invalid hex escape
+ } else if (body.startsWith('u')) {
+ // \uHHHH (4 hex digits)
+ const hex = body.slice(1);
+ if (hex.length === 4 && /^[0-9a-fA-F]{4}$/.test(hex)) {
+ const n = parseInt(hex, 16);
+ return String.fromCharCode(n);
+ }
+ return body; // Invalid unicode escape
+ } else if (body.startsWith('U')) {
+ // \UHHHHHHHH (8 hex digits)
+ const hex = body.slice(1);
+ if (hex.length === 8 && /^[0-9a-fA-F]{8}$/.test(hex)) {
+ const n = parseInt(hex, 16);
+ return String.fromCodePoint(n);
+ }
+ return body; // Invalid unicode escape
+ } else if (body === '\\') {
+ return '\\';
+ } else {
+ // Unknown escape, return the sequence as-is minus the leading backslash (best effort)
+ return body;
+ }
+}
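+
+// For example (given the rules above): decodeEscape('\\n') === '\n',
+// decodeEscape('\\x41') === 'A', decodeEscape('\\u00e9') === 'é'.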
+
+// ---- Range and text utilities ----
+
+/** Utility: create a LiteralNode from raw text and token range. */
+function literal(value: string, range: Range): LiteralNode {
+ return { kind: 'STRING', value, range };
+}
+
+/**
+ * Create a LiteralNode from IToken list.
+ */
+function literalFromTokens(tokens: IToken[], fromIToken?: (tokens: IToken[]) => string): LiteralNode {
+ const text = fromIToken ? fromIToken(tokens) : textFromRaw(tokens);
+ return literal(text, rangeFromTokens(tokens));
+}
+
+/**
+ * Convert CST token groups to a literal string.
+ * String contents are kept as is, no escape decoding.
+ */
+function literalFromCstTokens(groups: CstTokens[], fromIToken?: (tokens: IToken[]) => string): LiteralNode {
+ const text = textFromCstTokens(groups, fromIToken ?? textFromRaw);
+ return literal(text, rangeFromCstTokens(groups));
+}
+
+/**
+ * Range utilities.
+ * Build a range from two offsets (inclusive start, inclusive end).
+ */
+function rangeFrom(start: number, end: number): Range {
+ return { start, end };
+}
+
+/**
+ * Range that spans a list of tokens (or is [0, 0] if none).
+ */
+function rangeFromTokens(tokens: IToken[]): Range {
+ if (!tokens.length) {
+ return { start: 0, end: 0 };
+ }
+ const first = tokens[0];
+ const last = tokens[tokens.length - 1];
+ return rangeFrom(first.startOffset ?? 0, last.endOffset ?? first.startOffset ?? 0);
+}
+
+/**
+ * Range from Any CstNode (or is [0, 0] if none).
+ */
+function rangeFromCstNode(node: CstNode): Range {
+ const start = node.location?.startOffset ?? 0;
+ const end = node.location?.endOffset ?? node.location?.startOffset ?? start;
+ return rangeFrom(start, end);
+}
+
+/**
+ * Range that spans a list of CstTokens (or is [0, 0] if none).
+ */
+function rangeFromCstTokens(groups: CstTokens[]): Range {
+ const allTokens = groups.flatMap((g) => g.children.Content ?? []);
+ return rangeFromTokens(allTokens);
+}
+
+/** Gather raw text from a list of tokens without any decoding. */
+function textFromRaw(tokens: IToken[]): string {
+ return tokens.map((t) => t.image ?? '').join('');
+}
+
+/**
+ * Gather text from tokens INSIDE QUOTED STRINGS (attribute values & pragma quoted options).
+ * Rules:
+ * - Backslash escapes ARE decoded
+ * - Character entities are shown as-is (not decoded)
+ */
+function textFromQuoted(tokens: IToken[]): string {
+ return tokens
+ .map((t) => {
+ if (t.tokenType === BackslashEscape) {
+ return decodeEscape(t.image ?? '');
+ } else {
+ return t.image;
+ }
+ })
+ .join('');
+}
+
+/**
+ * Gather text from CstToken groups.
+ * Each group is expected to be a list of ITokens.
+ */
+function textFromCstTokens(groups: CstTokens[], fromIToken: (tokens: IToken[]) => string): string {
+ return groups.map((g) => fromIToken(g.children.Content ?? [])).join('');
+}
+
+// ---- AST Visitor ----
+
+const BaseVisitor = extendedPomlParser.getBaseCstVisitorConstructorWithDefaults();
+
+/**
+ * Extended POML CST -> AST builder.
+ *
+ * This visitor performs a shape-preserving transformation from the concrete
+ * syntax tree (CST) to the semantic abstract syntax tree (AST). It also
+ * normalizes textual content according to the lexer/parser contracts:
+ * - between-tags text decodes character entities (&amp; -> &)
+ * - quoted strings decode backslash escapes (\n, \xHH, \uHHHH, ...)
+ * - template expressions are preserved as raw text; evaluation is later
+ *
+ * It additionally checks that open/close tag names match and records errors
+ * instead of throwing where possible so downstream phases can proceed.
+ */
+export class ExtendedPomlAstVisitor extends BaseVisitor {
+ constructor() {
+ super();
+ this.validateVisitor();
+ }
+
+ /**
+ * A hack to let rule methods get a handle of the CstNode they are visiting.
+ */
+ visit(cstNode: CstNode | CstNode[], param?: any): AstNode {
+ return super.visit(cstNode, { ...param, node: cstNode });
+ }
+
+ // ---- Rule implementations ----
+
+ root(ctx: CstRootNode['children'], { node }: { node: CstRootNode }): RootNode {
+ const children: ElementContentNode[] = [];
+ for (const ec of ctx.Content ?? []) {
+ // Use a distinct name to avoid shadowing the destructured `node` parameter.
+ const child = this.visit(ec) as ElementContentNode;
+ if (child) {
+ children.push(child);
+ }
+ }
+
+ return { kind: 'ROOT', children, range: rangeFromCstNode(node) };
+ }
+
+ elementContent(
+ ctx: CstElementContentNode['children'],
+ { node }: { node: CstElementContentNode },
+ ): ElementContentNode {
+ if (ctx.Pragma?.length) {
+ return this.visit(ctx.Pragma[0]) as PragmaNode;
+ } else if (ctx.Comment?.length) {
+ return this.visit(ctx.Comment[0]) as CommentNode;
+ } else if (ctx.Template?.length) {
+ return this.visit(ctx.Template[0]) as TemplateNode;
+ } else if (ctx.Element?.length) {
+ return this.visit(ctx.Element[0]) as ElementNode;
+ } else if (ctx.TextContent?.length) {
+ // Text contents between tags
+ return this.visit(ctx.TextContent[0]) as LiteralNode;
+ }
+ // This should not happen
+ diagnostics.error('Unknown element content', rangeFromCstNode(node));
+ return literal('', rangeFromCstNode(node));
+ }
+
+ template(ctx: CstTemplateNode['children'], { node }: { node: CstTemplateNode }): TemplateNode {
+ const exprNode = literalFromCstTokens(ctx.Content ?? []);
+ return { kind: 'TEMPLATE', value: exprNode, range: rangeFromCstNode(node) };
+ }
+
+ comment(ctx: CstCommentNode['children'], { node }: { node: CstCommentNode }): CommentNode {
+ return {
+ kind: 'COMMENT',
+ value: literalFromCstTokens(ctx.Content ?? []),
+ range: rangeFromCstNode(node),
+ };
+ }
+
+ pragma(ctx: CstPragmaNode['children'], { node }: { node: CstPragmaNode }): PragmaNode {
+ const identifier = literalFromTokens(ctx.PragmaIdentifier ?? []);
+ const options: LiteralNode[] = [];
+
+ for (const option of ctx.PragmaOption ?? []) {
+ if ('tokenType' in option) {
+ // IToken
+ options.push(literal(option.image ?? '', rangeFromTokens([option])));
+ } else {
+ // CstQuotedNode
+ options.push(this.visit(option) as LiteralNode);
+ }
+ }
+
+ return {
+ kind: 'PRAGMA',
+ identifier,
+ options,
+ range: rangeFromCstNode(node),
+ };
+ }
+
+ quoted(ctx: CstQuotedNode['children'], { node }: { node: CstQuotedNode }): LiteralNode {
+ // Ignore the special strings like templates, entities, ...
+ return literalFromCstTokens(ctx.Content ?? [], textFromQuoted);
+ }
+
+ quotedTemplate(ctx: CstQuotedTemplateNode['children'], { node }: { node: CstQuotedTemplateNode }): ValueNode {
+ const children: (LiteralNode | TemplateNode)[] = [];
+
+ for (const content of ctx.Content ?? []) {
+ if (content.name === 'template') {
+ // CstTemplateNode
+ const templateNode = this.visit(content) as TemplateNode;
+ children.push(templateNode);
+ } else {
+ // CstTokens - regular text content
+ const lit = literalFromCstTokens([content as CstTokens], textFromQuoted);
+ children.push(lit);
+ }
+ }
+
+ return {
+ kind: 'VALUE',
+ children,
+ range: rangeFromCstNode(node),
+ };
+ }
+
+ forIteratorValue(ctx: CstForIteratorNode['children'], { node }: { node: CstForIteratorNode }): ForIteratorNode {
+ const iterator = literalFromTokens(ctx.Iterator ?? [], textFromQuoted);
+ const collection = literalFromCstTokens(ctx.Collection ?? [], textFromQuoted);
+
+ return {
+ kind: 'FORITERATOR',
+ iterator,
+ collection,
+ range: rangeFromCstNode(node),
+ };
+ }
+
+ attribute(ctx: CstAttributeNode['children'], { node }: { node: CstAttributeNode }): AttributeNode {
+ const key: LiteralNode = literalFromTokens(ctx.AttributeKey ?? []);
+ const range = rangeFromCstNode(node);
+
+ let value: ValueNode | ForIteratorNode;
+
+ if (ctx.forIteratorValue?.length) {
+ value = this.visit(ctx.forIteratorValue[0]) as ForIteratorNode;
+ } else if (ctx.quotedValue?.length) {
+ value = this.visit(ctx.quotedValue[0]) as ValueNode;
+ } else if (ctx.templatedValue?.length) {
+ // Unquoted: key={{ expr }} -> wrap as ValueNode with a TemplateNode child
+ const tpl = this.visit(ctx.templatedValue[0]) as TemplateNode;
+ value = { kind: 'VALUE', children: [tpl], range: tpl.range };
+ } else {
+ // Fallback empty value
+ diagnostics.error(`Attribute "${key.value}" is missing a value`, range);
+ value = { kind: 'VALUE', children: [], range: key.range };
+ }
+
+ return { kind: 'ATTRIBUTE', key, value, range };
+ }
+
+ /**
+ * Gather text from tokens for TEXT CONTENT (between tags).
+ * Rules:
+ * - Character entities are decoded
+ * - Backslash escapes are NOT interpreted (shown as-is)
+ */
+ betweenTagsTokens(ctx: CstTokens['children'], { node }: { node: CstTokens }): LiteralNode {
+ const tokens = ctx.Content ?? [];
+ const text = tokens
+ .map((t) => {
+ if (t.tokenType === CharacterEntity) {
+ try {
+ return he.decode(t.image ?? '', { strict: true });
+ } catch (e) {
+ diagnostics.error(`Failed to decode HTML entity: ${t.image}`, rangeFromTokens([t]));
+ }
+ }
+ return t.image ?? '';
+ })
+ .join('');
+ return literal(text, rangeFromTokens(tokens));
+ }
+
+ // openTagPartial and closeTag are skipped; they are handled implicitly in element()
+
+ element(ctx: CstElementNode['children'], { node }: { node: CstElementNode }): ElementNode {
+ const openTagPartial = ctx.OpenTagPartial?.[0];
+ const name = textFromRaw(openTagPartial?.children?.TagName ?? []);
+
+ const attributes = openTagPartial?.children?.Attribute?.map((a) => this.visit(a) as AttributeNode) ?? [];
+
+ let children: ElementContentNode[];
+
+ if (ctx.TextContent?.length) {
+ // Literal element: everything inside is plain text (no template interpolation)
+ children = [literalFromCstTokens(ctx.TextContent)];
+ } else {
+ // Normal element: nested content parsed as usual
+ children = ctx.Content?.map((ec) => this.visit(ec) as ElementContentNode) ?? [];
+ }
+
+ // Tag name matching check
+ const closeTag = ctx.CloseTag?.[0];
+ const closeTagName = textFromRaw(closeTag?.children?.TagName ?? []);
+ if (closeTag && name.toLowerCase() !== closeTagName.toLowerCase()) {
+ diagnostics.error(
+ `Mismatched closing tag: expected </${name}> but found </${closeTagName}>`,
+ rangeFromCstNode(closeTag),
+ );
+ }
+
+ return { kind: 'ELEMENT', name, attributes, children, range: rangeFromCstNode(node) };
+ }
+}
+
+/** Build an AST RootNode from a CST produced by the parser. */
+export function cstToAst(cst: CstNode): RootNode {
+ const visitor = new ExtendedPomlAstVisitor();
+ return visitor.visit(cst) as RootNode;
+}
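+
+// Example (illustrative):
+//   const { cst, lexErrors, parseErrors } = parsePomlToCst('<p>Hello {{ name }}</p>');
+//   const ast = cst && !lexErrors.length && !parseErrors.length ? cstToAst(cst) : undefined;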
diff --git a/packages/poml/reader/base.tsx b/packages/poml/next/base.tsx
similarity index 100%
rename from packages/poml/reader/base.tsx
rename to packages/poml/next/base.tsx
diff --git a/packages/poml/next/context.ts b/packages/poml/next/context.ts
new file mode 100644
index 00000000..bee95045
--- /dev/null
+++ b/packages/poml/next/context.ts
@@ -0,0 +1,33 @@
+/** One manager per POML compile (nested files do not count) */
+export type Context<T> = { [key: string]: T };
+export class ContextManager<T> {
+ private contextStore: { [key: string]: T } = {};
+ private stack: Array<{ [key: string]: T }> = [];
+
+ public initialize(initialContext: { [key: string]: T }) {
+ this.contextStore = { ...initialContext };
+ this.stack = [];
+ }
+
+ public setGlobalVariable(key: string, value: T) {
+ this.contextStore[key] = value;
+ }
+
+ public setLocalVariable(key: string, value: T) {
+ if (this.stack.length === 0) {
+ throw new Error('No local stack available');
+ }
+ this.stack[this.stack.length - 1][key] = value;
+ }
+
+ public pushStack(context: Context<T>) {
+ this.stack.push({ ...context });
+ }
+
+ public popStack() {
+ if (this.stack.length === 0) {
+ throw new Error('No local stack to pop');
+ }
+ this.stack.pop();
+ }
+}
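+
+// Example (illustrative): scoped variables for a nested render.
+//   const manager = new ContextManager<string>();
+//   manager.initialize({ topic: 'dogs' });      // globals for the whole file
+//   manager.pushStack({ item: 'corgi' });       // locals, e.g. for one loop iteration
+//   manager.setLocalVariable('item', 'husky');
+//   manager.popStack();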
diff --git a/packages/poml/next/cst.ts b/packages/poml/next/cst.ts
new file mode 100644
index 00000000..c025b032
--- /dev/null
+++ b/packages/poml/next/cst.ts
@@ -0,0 +1,552 @@
+import { CstParser, CstNode, IToken, TokenType } from 'chevrotain';
+import {
+ AllTokens,
+ TokensComment,
+ TokensExpression,
+ TokensDoubleQuoted,
+ TokensSingleQuoted,
+ TokensCommentIdentifiers,
+ TokensDoubleQuotedExpression,
+ TokensSingleQuotedExpression,
+ TokensTextContent,
+ CommentOpen,
+ CommentClose,
+ PragmaKeyword,
+ TemplateOpen,
+ TemplateClose,
+ ClosingOpenBracket,
+ SelfCloseBracket,
+ OpenBracket,
+ CloseBracket,
+ Equals,
+ DoubleQuote,
+ SingleQuote,
+ Whitespace,
+ Identifier,
+ extendedPomlLexer,
+} from './lexer';
+
+import {
+ CstTokens,
+ CstTemplateNode,
+ CstQuotedNode,
+ CstQuotedTemplateNode,
+ CstForIteratorNode,
+ CstAttributeNode,
+ CstOpenTagPartialNode,
+ CstCloseTagNode,
+ CstElementNode,
+ CstElementContentNode,
+ CstCommentNode,
+ CstPragmaNode,
+ CstRootNode,
+} from './nodes';
+import { listComponentAliases } from 'poml/base';
+
+/**
+ * Extended POML CST Parser
+ *
+ * Matches the CST shapes declared in nodes.ts.
+ * Rules are declared as class properties so TypeScript "sees" them.
+ * Labels are used **only** where the CST interfaces require custom names
+ * different from token/rule names (e.g., TagName, WsAfter*, TextContent, etc.).
+ */
+export class ExtendedPomlParser extends CstParser {
+ // ---- Rule property declarations (so TS knows they exist) ----
+ public root!: (idxInOriginalText?: number) => CstRootNode;
+ public elementContent!: (idxInOriginalText?: number) => CstElementContentNode;
+ // token-sequence helper rules
+ public commentTokens!: (idxInOriginalText?: number) => CstTokens;
+ public expressionTokens!: (idxInOriginalText?: number) => CstTokens;
+ public commentIdentifierTokens!: (idxInOriginalText?: number) => CstTokens;
+ public doubleQuotedTokens!: (idxInOriginalText?: number) => CstTokens;
+ public singleQuotedTokens!: (idxInOriginalText?: number) => CstTokens;
+ public doubleQuotedTrimmedTokens!: (idxInOriginalText?: number) => CstTokens;
+ public singleQuotedTrimmedTokens!: (idxInOriginalText?: number) => CstTokens;
+ public doubleQuotedExpressionTokens!: (idxInOriginalText?: number) => CstTokens;
+ public singleQuotedExpressionTokens!: (idxInOriginalText?: number) => CstTokens;
+ public betweenTagsTokens!: (idxInOriginalText?: number) => CstTokens;
+ // Accepting expectedTagName as argument to validate matching close tag
+ public literalTagTokens!: (idxInOriginalText?: number, args?: [string]) => CstTokens;
+ // regular rules
+ public template!: (idxInOriginalText?: number) => CstTemplateNode;
+ public comment!: (idxInOriginalText?: number) => CstCommentNode;
+ public pragma!: (idxInOriginalText?: number) => CstPragmaNode;
+ public quoted!: (idxInOriginalText?: number) => CstQuotedNode;
+ public quotedTemplate!: (idxInOriginalText?: number) => CstQuotedTemplateNode;
+ public forIteratorValue!: (idxInOriginalText?: number) => CstForIteratorNode;
+ public attribute!: (idxInOriginalText?: number) => CstAttributeNode;
+ public openTagPartial!: (idxInOriginalText?: number) => CstOpenTagPartialNode;
+ public closeTag!: (idxInOriginalText?: number) => CstCloseTagNode;
+ public element!: (idxInOriginalText?: number) => CstElementNode;
+
+ // ---- Tag names for rules (for CST nodes) ----
+ private validComponentNames: Set<string>;
+
+ // They are handled in file.tsx currently.
+ // I think they will be gradually moved to component registry in future.
+ private validDirectives: Set<string> = new Set([
+ 'include',
+ 'let',
+ 'output-schema',
+ 'outputschema',
+ 'tool-definition',
+ 'tool-def',
+ 'tooldef',
+ 'tool',
+ 'template',
+ ]);
+ // This list affects the CST parser stage only.
+ private literalTagNames: Set<string> = new Set(['text', 'template']);
+
+ // ---- Small helpers ----
+ private anyOf = (tokenTypes: TokenType[], label?: string) =>
+ tokenTypes.map((tt) => ({
+ ALT: () => (label ? this.CONSUME(tt, { LABEL: label }) : this.CONSUME(tt)),
+ }));
+
+ // Lookahead helper: check whether the next token (skipping at most one whitespace) is of the given type
+ private atAlmostClose = (tokenType: TokenType) => {
+ if (this.LA(1).tokenType === Whitespace) {
+ return this.LA(2).tokenType === tokenType;
+ }
+ return this.LA(1).tokenType === tokenType;
+ };
+
+ private isNextPragma = () => {
+ if (this.LA(1).tokenType !== CommentOpen) {
+ return false;
+ }
+ if (this.LA(2).tokenType === Whitespace) {
+ return this.LA(3).tokenType === PragmaKeyword;
+ }
+ return this.LA(2).tokenType === PragmaKeyword;
+ };
+
+ private isAtLiteralClose = (expectedTagName: string | undefined) => {
+ if (this.LA(1).tokenType !== ClosingOpenBracket) {
+ return false;
+ }
+ const t = this.LA(2).tokenType === Whitespace ? this.LA(3) : this.LA(2);
+ if (t.tokenType !== Identifier) {
+ return false;
+ }
+ const name = (t.image || '').toLowerCase();
+
+ return name === expectedTagName?.toLowerCase();
+ };
+
+ private peekTagName = (): string | undefined => {
+ if (this.LA(1).tokenType !== OpenBracket) {
+ return undefined;
+ }
+ const token = this.LA(2).tokenType === Whitespace ? this.LA(3) : this.LA(2);
+ if (token.tokenType !== Identifier) {
+ return undefined;
+ }
+ return token.image;
+ };
+
+ private isValidOpenTag = (tagName: string) => {
+ // When pragma strict is enabled, only known component names are allowed as tags.
+ // Unknown tag names will show as errors in the semantic analysis stage.
+ // When pragma strict is not enabled, tag names that are not known components
+ // will be treated as texts.
+ return this.validComponentNames.has(tagName.toLowerCase());
+ };
+
+ constructor() {
+ super(AllTokens, {
+ recoveryEnabled: true,
+ nodeLocationTracking: 'full',
+ maxLookahead: 3,
+ });
+ this.validComponentNames = new Set(listComponentAliases());
+
+ // ---------------------------
+ // RULE DEFINITIONS (as properties)
+ // ---------------------------
+
+ this.root = this.RULE('root', () => {
+ // CstRootNode: { Content?: CstElementContentNode[] }
+ this.MANY(() => {
+ this.SUBRULE(this.elementContent, { LABEL: 'Content' });
+ });
+ });
+
+ this.elementContent = this.RULE('elementContent', () => {
+ this.OR([
+ // pragma (must come before raw comment)
+ {
+ GATE: this.isNextPragma,
+ ALT: () => this.SUBRULE(this.pragma, { LABEL: 'Pragma' }),
+ },
+ // regular comment
+ {
+ ALT: () => this.SUBRULE(this.comment, { LABEL: 'Comment' }),
+ },
+
+ // template
+ {
+ GATE: () => this.LA(1).tokenType === TemplateOpen,
+ ALT: () => this.SUBRULE(this.template, { LABEL: 'Template' }),
+ },
+
+ // normal element
+ {
+ ALT: () => this.SUBRULE(this.element, { LABEL: 'Element' }),
+ },
+
+ // raw text content
+ {
+ ALT: () => {
+ // Group text between tags under CstBetweenTagsTokens
+ this.SUBRULE(this.betweenTagsTokens, { LABEL: 'TextContent' });
+ },
+ },
+ ]);
+ });
+
+ // ----- Token sequence helper rules -----
+ this.commentTokens = this.RULE('commentTokens', () => {
+ // Can be empty
+ this.MANY(() => {
+ this.OR(this.anyOf(TokensComment, 'Content'));
+ });
+ });
+
+ this.commentIdentifierTokens = this.RULE('commentIdentifierTokens', () => {
+ // Used in @pragma options without quotes.
+ this.AT_LEAST_ONE(() => {
+ this.OR(this.anyOf(TokensCommentIdentifiers, 'Content'));
+ });
+ });
+
+ this.expressionTokens = this.RULE('expressionTokens', () => {
+ // Always trim the ws around the expression {{ expr }}.
+ // Must be non-empty.
+ this.AT_LEAST_ONE({
+ GATE: () => !this.atAlmostClose(TemplateClose),
+ DEF: () => {
+ this.OR(this.anyOf(TokensExpression, 'Content'));
+ },
+ });
+ });
+
+ this.doubleQuotedTokens = this.RULE('doubleQuotedTokens', () => {
+ // The untrimmed content within "...", can be empty.
+ this.MANY(() => {
+ this.OR(this.anyOf(TokensDoubleQuoted, 'Content'));
+ });
+ });
+
+ this.singleQuotedTokens = this.RULE('singleQuotedTokens', () => {
+ // The untrimmed content in '...', can be empty.
+ this.MANY(() => {
+ this.OR(this.anyOf(TokensSingleQuoted, 'Content'));
+ });
+ });
+
+ this.doubleQuotedTrimmedTokens = this.RULE('doubleQuotedTrimmedTokens', () => {
+ // Trimmed content in "..." without leading/trailing whitespace
+ // Must be non-empty.
+ // Greedily match until the next double quote (allow inner whitespace)
+ this.AT_LEAST_ONE({
+ GATE: () => !this.atAlmostClose(DoubleQuote),
+ DEF: () => {
+ this.OR(this.anyOf(TokensDoubleQuoted, 'Content'));
+ },
+ });
+ });
+
+ this.singleQuotedTrimmedTokens = this.RULE('singleQuotedTrimmedTokens', () => {
+ // Trimmed content without leading/trailing whitespace
+ // Must be non-empty.
+ // Greedily match until the next single quote (allow inner whitespace)
+ this.AT_LEAST_ONE({
+ GATE: () => !this.atAlmostClose(SingleQuote),
+ DEF: () => {
+ this.OR(this.anyOf(TokensSingleQuoted, 'Content'));
+ },
+ });
+ });
+
+ this.doubleQuotedExpressionTokens = this.RULE('doubleQuotedExpressionTokens', () => {
+ // Contents in "...{{ ... }}..." but outside the {{ }}
+ // Must be non-empty. Can have leading/trailing whitespace.
+ this.AT_LEAST_ONE(() => {
+ this.OR(this.anyOf(TokensDoubleQuotedExpression, 'Content'));
+ });
+ });
+
+ this.singleQuotedExpressionTokens = this.RULE('singleQuotedExpressionTokens', () => {
+ // Contents in '...{{ ... }}...' but outside the {{ }}
+ // Must be non-empty. Can have leading/trailing whitespace.
+ this.AT_LEAST_ONE(() => {
+ this.OR(this.anyOf(TokensSingleQuotedExpression, 'Content'));
+ });
+ });
+
+ this.betweenTagsTokens = this.RULE('betweenTagsTokens', () => {
+ // Plain texts within tags but outside nested tags. Must be non-empty.
+ this.AT_LEAST_ONE(() => {
+ this.OR(this.anyOf(TokensTextContent, 'Content'));
+ });
+ });
+
+ this.literalTagTokens = this.RULE('literalTagTokens', (expectedTagName?: string) => {
+ // Plain texts within literal tags like <text>...</text>.
+ // Match greedily until the literal close tag. Must be non-empty.
+ this.AT_LEAST_ONE({
+ GATE: () => !this.isAtLiteralClose(expectedTagName),
+ DEF: () => {
+ this.OR(this.anyOf(AllTokens, 'Content'));
+ },
+ });
+ });
+
+ // ----- Main rules -----
+
+ this.template = this.RULE('template', () => {
+ this.CONSUME(TemplateOpen);
+ this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' }));
+ this.SUBRULE(this.expressionTokens, { LABEL: 'Content' });
+ this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsAfterContent' }));
+ this.CONSUME2(TemplateClose);
+ });
+
+ this.comment = this.RULE('comment', () => {
+ this.CONSUME(CommentOpen);
+ // anything until -->
+ this.SUBRULE(this.commentTokens, { LABEL: 'Content' });
+ this.CONSUME(CommentClose);
+ });
+
+ this.pragma = this.RULE('pragma', () => {
+ this.CONSUME(CommentOpen);
+ this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' }));
+ this.CONSUME(PragmaKeyword);
+ this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsAfterPragma' }));
+
+ // identifier after @pragma
+ this.CONSUME(Identifier, { LABEL: 'PragmaIdentifier' });
+
+ // Options: unquoted tokens or quoted strings (no templates inside these)
+ this.MANY(() => {
+ this.CONSUME3(Whitespace, { LABEL: 'WsBeforeEachOption' });
+ this.OR([
+ {
+ // Try quoted options first
+ ALT: () => this.SUBRULE(this.quoted, { LABEL: 'PragmaOption' }),
+ },
+ {
+ // Then try identifier tokens (can include +, -, etc.)
+ GATE: () => this.LA(1).tokenType !== SingleQuote && this.LA(1).tokenType !== DoubleQuote,
+ ALT: () => this.SUBRULE2(this.commentIdentifierTokens, { LABEL: 'PragmaOption' }),
+ },
+ ]);
+ });
+
+ this.OPTION3(() => this.CONSUME4(Whitespace, { LABEL: 'WsAfterAll' }));
+
+ this.CONSUME(CommentClose);
+ });
+
+ this.quoted = this.RULE('quoted', () => {
+ this.OR([
+ {
+ ALT: () => {
+ this.CONSUME(DoubleQuote, { LABEL: 'OpenQuote' });
+ this.SUBRULE(this.doubleQuotedTokens, { LABEL: 'Content' });
+ this.CONSUME2(DoubleQuote, { LABEL: 'CloseQuote' });
+ },
+ },
+ {
+ ALT: () => {
+ this.CONSUME(SingleQuote, { LABEL: 'OpenQuote' });
+ this.SUBRULE(this.singleQuotedTokens, { LABEL: 'Content' });
+ this.CONSUME2(SingleQuote, { LABEL: 'CloseQuote' });
+ },
+ },
+ ]);
+ });
+
+ this.quotedTemplate = this.RULE('quotedTemplate', () => {
+ this.OR([
+ {
+ ALT: () => {
+ this.CONSUME(DoubleQuote, { LABEL: 'OpenQuote' });
+ this.MANY(() => {
+ this.OR2([
+ { ALT: () => this.SUBRULE(this.template, { LABEL: 'Content' }) },
+ {
+ ALT: () => this.SUBRULE2(this.doubleQuotedExpressionTokens, { LABEL: 'Content' }),
+ },
+ ]);
+ });
+ this.CONSUME2(DoubleQuote, { LABEL: 'CloseQuote' });
+ },
+ },
+ {
+ ALT: () => {
+ this.CONSUME(SingleQuote, { LABEL: 'OpenQuote' });
+ this.MANY2(() => {
+ this.OR3([
+ { ALT: () => this.SUBRULE3(this.template, { LABEL: 'Content' }) },
+ {
+ ALT: () => this.SUBRULE4(this.singleQuotedExpressionTokens, { LABEL: 'Content' }),
+ },
+ ]);
+ });
+ this.CONSUME2(SingleQuote, { LABEL: 'CloseQuote' });
+ },
+ },
+ ]);
+ });
+
+ this.forIteratorValue = this.RULE('forIteratorValue', () => {
+ this.OR([
+ {
+ ALT: () => {
+ this.CONSUME(DoubleQuote, { LABEL: 'OpenQuote' });
+ this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' }));
+ this.CONSUME(Identifier, { LABEL: 'Iterator' });
+ this.CONSUME2(Whitespace, { LABEL: 'WsAfterIterator' });
+ this.CONSUME2(Identifier, { LABEL: 'InKeyword' });
+ this.CONSUME3(Whitespace, { LABEL: 'WsAfterIn' });
+ // Greedily match until the next unescaped quote
+ this.SUBRULE(this.doubleQuotedTrimmedTokens, { LABEL: 'Collection' });
+ this.OPTION2(() => this.CONSUME4(Whitespace, { LABEL: 'WsAfterCollection' }));
+ this.CONSUME2(DoubleQuote, { LABEL: 'CloseQuote' });
+ },
+ },
+ {
+ ALT: () => {
+ this.CONSUME(SingleQuote, { LABEL: 'OpenQuote' });
+ this.OPTION3(() => this.CONSUME5(Whitespace, { LABEL: 'WsAfterOpen' }));
+ this.CONSUME3(Identifier, { LABEL: 'Iterator' });
+ this.CONSUME6(Whitespace, { LABEL: 'WsAfterIterator' });
+ this.CONSUME4(Identifier, { LABEL: 'InKeyword' });
+ this.CONSUME7(Whitespace, { LABEL: 'WsAfterIn' });
+ // Greedily match until the next unescaped quote
+ this.SUBRULE(this.singleQuotedTrimmedTokens, { LABEL: 'Collection' });
+ this.OPTION4(() => this.CONSUME8(Whitespace, { LABEL: 'WsAfterCollection' }));
+ this.CONSUME2(SingleQuote, { LABEL: 'CloseQuote' });
+ },
+ },
+ ]);
+ });
+
+ this.attribute = this.RULE('attribute', () => {
+ const keyTok = this.CONSUME(Identifier, { LABEL: 'AttributeKey' });
+ this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterKey' }));
+ this.CONSUME(Equals); // label not needed; token name matches
+ this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsAfterEquals' }));
+
+ this.OR([
+ // for="..."
+ {
+ GATE: () =>
+ keyTok.image?.toLowerCase() === 'for' &&
+ (this.LA(1).tokenType === DoubleQuote || this.LA(1).tokenType === SingleQuote),
+ ALT: () => this.SUBRULE(this.forIteratorValue, { LABEL: 'forIteratorValue' }),
+ },
+ // templatedValue: {{ ... }}
+ {
+ GATE: () => this.LA(1).tokenType === TemplateOpen,
+ ALT: () => this.SUBRULE(this.template, { LABEL: 'templatedValue' }),
+ },
+ // quotedValue: "..."/'...' (may contain templates)
+ { ALT: () => this.SUBRULE(this.quotedTemplate, { LABEL: 'quotedValue' }) },
+ ]);
+ });
+
+ this.openTagPartial = this.RULE('openTagPartial', () => {
+ this.CONSUME(OpenBracket);
+ this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' }));
+ const tagTok = this.CONSUME(Identifier, { LABEL: 'TagName' });
+ this.MANY(() => {
+ this.CONSUME2(Whitespace, { LABEL: 'WsBeforeEachAttribute' });
+ this.SUBRULE(this.attribute, { LABEL: 'Attribute' });
+ });
+ this.OPTION2(() => this.CONSUME3(Whitespace, { LABEL: 'WsAfterAll' }));
+ });
+
+ this.closeTag = this.RULE('closeTag', () => {
+ this.CONSUME(ClosingOpenBracket);
+ this.OPTION(() => this.CONSUME(Whitespace, { LABEL: 'WsAfterOpen' }));
+ this.CONSUME(Identifier, { LABEL: 'TagName' });
+ this.OPTION2(() => this.CONSUME2(Whitespace, { LABEL: 'WsBeforeClose' }));
+ this.CONSUME(CloseBracket);
+ });
+
+ this.element = this.RULE('element', () => {
+ const tagName = this.peekTagName();
+ this.OR([
+ {
+ GATE: () => this.literalTagNames.has(tagName?.toLowerCase() || ''),
+ ALT: () => {
+ this.SUBRULE(this.openTagPartial, { LABEL: 'OpenTagPartial' });
+ // Literal element logic - must have closing tag, no self-close
+ this.CONSUME(CloseBracket, { LABEL: 'OpenTagCloseBracket' });
+
+ // Everything until the matching close tag is treated as raw text.
+ // We impose a stricter check when closing a literal tag to avoid matching the wrong close tag,
+ // which makes some auto-completion scenarios fail but lets literal elements enclose more kinds of raw content.
+ this.SUBRULE(this.literalTagTokens, { ARGS: [tagName], LABEL: 'TextContent' });
+
+ this.SUBRULE(this.closeTag, { LABEL: 'CloseTag' });
+ },
+ },
+ {
+ GATE: () => tagName === undefined || !this.literalTagNames.has(tagName?.toLowerCase()),
+ ALT: () => {
+ this.SUBRULE2(this.openTagPartial, { LABEL: 'OpenTagPartial' });
+ this.OR2([
+ {
+ ALT: () => {
+ this.CONSUME2(CloseBracket, { LABEL: 'OpenTagCloseBracket' });
+ this.MANY(() => {
+ this.SUBRULE(this.elementContent, { LABEL: 'Content' });
+ });
+ this.SUBRULE2(this.closeTag, { LABEL: 'CloseTag' });
+ },
+ },
+ {
+ ALT: () => {
+ // Self-closing tag - no content, no closing tag
+ this.CONSUME(SelfCloseBracket, { LABEL: 'SelfCloseBracket' });
+ },
+ },
+ ]);
+ },
+ },
+ ]);
+ });
+
+ this.performSelfAnalysis();
+ }
+
+ public parseRoot(): CstNode {
+ // Invoke the entry rule (property is a function)
+ return this.root();
+ }
+}
+
+// Singleton parser
+export const extendedPomlParser = new ExtendedPomlParser();
+
+export function parsePomlToCst(input: string): {
+ cst: CstNode | undefined;
+ lexErrors: ReturnType<typeof extendedPomlLexer.tokenize>['errors'];
+ parseErrors: typeof extendedPomlParser.errors;
+} {
+ const lex = extendedPomlLexer.tokenize(input);
+ extendedPomlParser.input = lex.tokens;
+ const cst = extendedPomlParser.parseRoot();
+ return {
+ cst,
+ lexErrors: lex.errors,
+ parseErrors: extendedPomlParser.errors,
+ };
+}
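+
+// Example (illustrative):
+//   const { cst, lexErrors, parseErrors } = parsePomlToCst('Intro text <task>Do something</task>');
+//   if (cst && lexErrors.length === 0 && parseErrors.length === 0) {
+//     // hand the CST to cstToAst() in ./ast
+//   }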
diff --git a/packages/poml/next/diagnostics.ts b/packages/poml/next/diagnostics.ts
new file mode 100644
index 00000000..75090e53
--- /dev/null
+++ b/packages/poml/next/diagnostics.ts
@@ -0,0 +1,457 @@
+import * as path from 'path';
+import chalk from 'chalk';
+import { Diagnostic, Range, Severity } from './types';
+import sourceManager from './source';
+
+interface FormatOptions {
+ showWarnings?: boolean;
+ showInfo?: boolean;
+ groupByFile?: boolean;
+}
+
+/**
+ * Global Error Collector.
+ *
+ * Goals:
+ *
+ * 1. Centralized singleton that collects errors from anywhere in the codebase
+ * 2. Support for error types (error/warning), source locations (file, line, column, index ranges), and contextual data
+ * 3. Handle errors from embedded languages (JSON, JS expressions) with source mapping back to original positions
+ * 4. Track errors across multiple source files without conflicts
+ * 5. Collect multiple errors without stopping execution
+ * 6. Clear errors between compilation runs or test cases
+ * 7. Generate human-readable, formatted error messages with source context
+ */
+export class ErrorCollector {
+ private diagnostics: Diagnostic[] = [];
+ private suppressedCodes = new Set<string>();
+ private maxErrors = 100;
+
+ /**
+ * Clear all collected errors
+ */
+ public clear(): void {
+ this.diagnostics = [];
+ }
+
+ /**
+ * Post an error
+ */
+ public error(message: string, range?: Range, options: Partial<Diagnostic> = {}): void {
+ this.add({
+ ...options,
+ severity: Severity.ERROR,
+ message,
+ range,
+ sourceFile: options.sourceFile || sourceManager.getCurrentFile(),
+ });
+ }
+
+ /**
+ * Post a warning
+ */
+ public warning(message: string, range?: Range, options: Partial<Diagnostic> = {}): void {
+ this.add({
+ ...options,
+ severity: Severity.WARNING,
+ message,
+ range,
+ sourceFile: options.sourceFile || sourceManager.getCurrentFile(),
+ });
+ }
+
+ /**
+ * Post an info message
+ */
+ public info(message: string, range?: Range, options: Partial<Diagnostic> = {}): void {
+ this.add({
+ ...options,
+ severity: Severity.INFO,
+ message,
+ range,
+ sourceFile: options.sourceFile || sourceManager.getCurrentFile(),
+ });
+ }
+
+ /**
+ * Add a diagnostic
+ */
+ public add(diagnostic: Diagnostic): void {
+ // Check error limit
+ if (this.diagnostics.length >= this.maxErrors) {
+ if (this.diagnostics.length === this.maxErrors) {
+ this.diagnostics.push({
+ severity: Severity.ERROR,
+ message: `Error limit reached (${this.maxErrors}). Further errors suppressed.`,
+ });
+ }
+ return;
+ }
+
+ // Skip suppressed error codes
+ if (diagnostic.code && this.suppressedCodes.has(diagnostic.code)) {
+ return;
+ }
+
+ // Add current file if not specified
+ if (!diagnostic.sourceFile && sourceManager.getCurrentFile()) {
+ diagnostic.sourceFile = sourceManager.getCurrentFile();
+ }
+
+ this.diagnostics.push(diagnostic);
+ }
+
+ /**
+ * Post a JSON parsing error with automatic position mapping
+ */
+ public jsonError(originalError: Error, jsonRange: Range): void {
+ // Extract position from JSON parse error if available
+ const posMatch = originalError.message.match(/position (\d+)/);
+ let range = jsonRange;
+
+ if (posMatch) {
+ const errorPos = parseInt(posMatch[1]);
+ // Map the JSON error position to the original source
+ range = {
+ start: jsonRange.start + errorPos,
+ end: jsonRange.start + errorPos + 1,
+ };
+ }
+
+ this.error(`JSON parsing error: ${originalError.message}`, range, {
+ code: 'JSON_PARSE_ERROR',
+ originalError,
+ hint: 'Check for trailing commas, unquoted keys, or undefined values',
+ });
+ }
+
+ /**
+ * Post a JavaScript expression evaluation error
+ */
+ public expressionError(originalError: Error, expressionRange: Range, evalHeaderLength: number = 0): void {
+ // Adjust range if there's a header (like "return " or "const result = ")
+ const adjustedRange =
+ evalHeaderLength > 0
+ ? {
+ start: expressionRange.start + evalHeaderLength,
+ end: expressionRange.end,
+ }
+ : expressionRange;
+
+ // Try to extract line/column from error stack
+ const stackMatch = originalError.stack?.match(/:(\d+):(\d+)/);
+ let range = adjustedRange;
+
+ if (stackMatch) {
+ const errorLine = parseInt(stackMatch[1]);
+ const errorCol = parseInt(stackMatch[2]);
+
+ // If we have line/column info, try to be more precise
+ const currentFileContent = sourceManager.getCurrentFileContent();
+ if (currentFileContent) {
+ const exprContent = currentFileContent.substring(expressionRange.start, expressionRange.end);
+ const lines = exprContent.split('\n');
+
+ if (errorLine <= lines.length) {
+ let offset = expressionRange.start;
+ for (let i = 0; i < errorLine - 1; i++) {
+ offset += lines[i].length + 1; // +1 for newline
+ }
+ offset += Math.min(errorCol - 1, lines[errorLine - 1].length);
+
+ range = {
+ start: offset,
+ end: offset + 1,
+ };
+ }
+ }
+ }
+
+ this.error(`Expression evaluation failed: ${originalError.message}`, range, {
+ code: 'EXPRESSION_ERROR',
+ originalError,
+ hint: 'Check variable names and syntax in the expression',
+ });
+ }
+
+ /**
+ * Suppress errors with specific codes
+ */
+ public suppressCode(code: string): void {
+ this.suppressedCodes.add(code);
+ }
+
+ /**
+ * Format a single diagnostic for CLI output
+ */
+ private formatDiagnostic(diagnostic: Diagnostic): string {
+ const parts: string[] = [];
+
+ // Severity and code
+ const severityColor = {
+ [Severity.ERROR]: chalk.red,
+ [Severity.WARNING]: chalk.yellow,
+ [Severity.INFO]: chalk.blue,
+ }[diagnostic.severity];
+
+ let header = severityColor(diagnostic.severity.toUpperCase());
+
+ if (diagnostic.code) {
+ header += chalk.gray(` [${diagnostic.code}]`);
+ }
+
+ // File location
+ if (diagnostic.sourceFile) {
+ const source = sourceManager.loadSource(diagnostic.sourceFile);
+
+ if (source && diagnostic.range) {
+ const startPos = sourceManager.indexToPosition(source, diagnostic.range.start);
+ const location = `${diagnostic.sourceFile}:${startPos.line}:${startPos.column}`;
+ header += ` ${chalk.cyan(location)}`;
+ } else {
+ header += ` ${chalk.cyan(diagnostic.sourceFile)}`;
+ }
+ }
+
+ parts.push(header);
+
+ // Message
+ parts.push(` ${diagnostic.message}`);
+
+ // Source context
+ if (diagnostic.sourceFile && diagnostic.range) {
+ const source = sourceManager.loadSource(diagnostic.sourceFile);
+
+ if (source) {
+ const startPos = sourceManager.indexToPosition(source, diagnostic.range.start);
+ const endPos = sourceManager.indexToPosition(source, diagnostic.range.end);
+
+ // Show context lines
+ const contextLines = 2;
+ const startLine = Math.max(0, startPos.line - contextLines - 1);
+ const endLine = Math.min(source.lines.length - 1, startPos.line + contextLines - 1);
+
+ parts.push('');
+
+ for (let i = startLine; i <= endLine; i++) {
+ const lineNum = String(i + 1).padStart(4, ' ');
+ const isErrorLine = i === startPos.line - 1;
+ const pipe = isErrorLine ? '>' : '|';
+ const lineColor = isErrorLine ? chalk.white : chalk.gray;
+
+ parts.push(chalk.gray(` ${lineNum} ${pipe}`) + ' ' + lineColor(source.lines[i]));
+
+ // Add error underline
+ if (isErrorLine) {
+ const spacing = ' '.repeat(startPos.column - 1 + 8); // 8 = visible width of the ' NNNN | ' gutter
+ let markerLength = 1;
+
+ if (startPos.line === endPos.line) {
+ markerLength = Math.max(1, endPos.column - startPos.column);
+ } else {
+ markerLength = source.lines[i].length - startPos.column + 1;
+ }
+
+ const marker = '^'.repeat(Math.min(markerLength, 80));
+ parts.push(severityColor(spacing + marker));
+ }
+ }
+ }
+ }
+
+ // Hint
+ if (diagnostic.hint) {
+ parts.push('');
+ parts.push(chalk.green(` 💡 ${diagnostic.hint}`));
+ }
+
+ return parts.join('\n');
+ }
+
+ /**
+ * Get all errors
+ */
+ public getErrors(): Diagnostic[] {
+ return this.diagnostics.filter((d) => d.severity === Severity.ERROR);
+ }
+
+ /**
+ * Get all warnings
+ */
+ public getWarnings(): Diagnostic[] {
+ return this.diagnostics.filter((d) => d.severity === Severity.WARNING);
+ }
+
+ /**
+ * Check if there are any errors
+ */
+ public hasErrors(): boolean {
+ return this.getErrors().length > 0;
+ }
+
+ /**
+ * Get count by severity
+ */
+ public getCounts(): { errors: number; warnings: number; info: number } {
+ const counts = { errors: 0, warnings: 0, info: 0 };
+
+ for (const d of this.diagnostics) {
+ switch (d.severity) {
+ case Severity.ERROR:
+ counts.errors++;
+ break;
+ case Severity.WARNING:
+ counts.warnings++;
+ break;
+ case Severity.INFO:
+ counts.info++;
+ break;
+ }
+ }
+
+ return counts;
+ }
+
+ /**
+ * Format all diagnostics for CLI output
+ */
+ public format(options?: FormatOptions): string {
+ const { showWarnings = true, showInfo = false, groupByFile = true } = options ?? {};
+
+ const filtered = this.diagnostics.filter((d) => {
+ if (d.severity === Severity.ERROR) {
+ return true;
+ }
+ if (d.severity === Severity.WARNING) {
+ return showWarnings;
+ }
+ if (d.severity === Severity.INFO) {
+ return showInfo;
+ }
+ return false;
+ });
+
+ if (filtered.length === 0) {
+ return chalk.green('✓ No issues found');
+ }
+
+ const output: string[] = [];
+
+ if (groupByFile) {
+ // Group by file
+ const byFile = new Map<string, Diagnostic[]>();
+ const noFile: Diagnostic[] = [];
+
+ for (const d of filtered) {
+ if (d.sourceFile) {
+ if (!byFile.has(d.sourceFile)) {
+ byFile.set(d.sourceFile, []);
+ }
+ byFile.get(d.sourceFile)!.push(d);
+ } else {
+ noFile.push(d);
+ }
+ }
+
+ // Sort files
+ const sortedFiles = Array.from(byFile.keys()).sort();
+
+ for (const file of sortedFiles) {
+ output.push(chalk.underline.bold(path.relative(process.cwd(), file)));
+ output.push('');
+
+ const diagnostics = byFile.get(file)!.sort((a, b) => {
+ if (!a.range || !b.range) {
+ return 0;
+ }
+ return a.range.start - b.range.start;
+ });
+
+ for (const d of diagnostics) {
+ output.push(this.formatDiagnostic(d));
+ output.push('');
+ }
+ }
+
+ // Add diagnostics without file
+ if (noFile.length > 0) {
+ output.push(chalk.underline.bold('General'));
+ output.push('');
+ for (const d of noFile) {
+ output.push(this.formatDiagnostic(d));
+ output.push('');
+ }
+ }
+ } else {
+ // Simple list
+ for (const d of filtered) {
+ output.push(this.formatDiagnostic(d));
+ output.push('');
+ }
+ }
+
+ // Summary
+ const counts = this.getCounts();
+ const summary: string[] = [];
+
+ if (counts.errors > 0) {
+ summary.push(chalk.red(`${counts.errors} error${counts.errors !== 1 ? 's' : ''}`));
+ }
+ if (counts.warnings > 0 && showWarnings) {
+ summary.push(chalk.yellow(`${counts.warnings} warning${counts.warnings !== 1 ? 's' : ''}`));
+ }
+ if (counts.info > 0 && showInfo) {
+ summary.push(chalk.blue(`${counts.info} info`));
+ }
+
+ output.push(chalk.bold(`Found ${summary.join(', ')}`));
+
+ return output.join('\n');
+ }
+
+ /**
+ * Print formatted errors to console
+ */
+ public print(options?: FormatOptions): void {
+ console.log(this.format(options));
+ }
+
+ /**
+ * Get all diagnostics
+ */
+ public getDiagnostics(): ReadonlyArray<Diagnostic> {
+ return this.diagnostics;
+ }
+}
+
+// Create singleton instance
+let errorCollector: ErrorCollector | undefined = undefined;
+
+export function getErrorCollector(): ErrorCollector {
+ if (!errorCollector) {
+ errorCollector = new ErrorCollector();
+ }
+ return errorCollector;
+}
+
+// Convenience export
+
+export const clear = () => getErrorCollector().clear();
+export const error = (message: string, range?: Range, options: Partial<Diagnostic> = {}) =>
+ getErrorCollector().error(message, range, options);
+export const warning = (message: string, range?: Range, options: Partial<Diagnostic> = {}) =>
+ getErrorCollector().warning(message, range, options);
+export const info = (message: string, range?: Range, options: Partial<Diagnostic> = {}) =>
+ getErrorCollector().info(message, range, options);
+export const jsonError = (originalError: Error, jsonRange: Range) =>
+ getErrorCollector().jsonError(originalError, jsonRange);
+export const expressionError = (originalError: Error, expressionRange: Range, evalHeaderLength: number = 0) =>
+ getErrorCollector().expressionError(originalError, expressionRange, evalHeaderLength);
+export const suppressCode = (code: string) => getErrorCollector().suppressCode(code);
+export const hasErrors = () => getErrorCollector().hasErrors();
+export const getErrors = () => getErrorCollector().getErrors();
+export const getWarnings = () => getErrorCollector().getWarnings();
+export const getCounts = () => getErrorCollector().getCounts();
+export const format = (options?: FormatOptions) => getErrorCollector().format(options);
+export const print = (options?: FormatOptions) => getErrorCollector().print(options);
+export const getDiagnostics = () => getErrorCollector().getDiagnostics();
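+
+// Usage sketch (illustrative only; the error code and hint text are hypothetical):
+// import * as diagnostics from './diagnostics';
+// diagnostics.error('Mismatched closing tag', { start: 10, end: 16 }, { code: 'POML001', hint: 'Check the tag nesting' });
+// if (diagnostics.hasErrors()) {
+//   diagnostics.print({ showWarnings: true });
+// }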
diff --git a/packages/poml/next/lexer.ts b/packages/poml/next/lexer.ts
new file mode 100644
index 00000000..5cd22c13
--- /dev/null
+++ b/packages/poml/next/lexer.ts
@@ -0,0 +1,167 @@
+import { createToken, Lexer } from 'chevrotain';
+
+// Define token types for extended POML
+export const CommentOpen = createToken({ name: 'CommentOpen', pattern: /<!--/ });
+export const CommentClose = createToken({ name: 'CommentClose', pattern: /-->/ });
+
+/**
+ * Arbitrary text content, i.e. any run of characters not claimed by a more
+ * specific token.
+ *
+ * Disallowed (matched by other tokens instead):
+ * - starts or ends a tag: <, >, </, />
+ * - starts or ends a template: {{, }}
+ * - starts or ends a string literal: " or '
+ * - whitespace (handled separately - includes control chars)
+ * - equal sign (=)
+ * - backslash \ (handled separately for escaping)
+ * - valid backslash escape sequences such as \n, \t, \", \', \\, \xHH, \uHHHH, \UHHHHHHHH, \{{, \}}
+ * - character entities such as &#123; or &name;
+ *
+ * Allowed:
+ * - Single { or } are OK if they are not followed by another brace
+ * - Incomplete tag delimiters such as / (/< is an exception, because < is a start of tag)
+ * - Incomplete comment delimiters such as !-- or -- are OK
+ * - Incorrect @pragma directive such as @pragm or @pragmaX will be matched
+ * - Invalid character entities such as &abc (without semicolon) or & (by itself)
+ * - All other Unicode characters including emojis, CJK, etc.
+ */
+export const Arbitrary = createToken({
+ name: 'Arbitrary',
+ // Match anything except: <, >, quotes, =, backslash, whitespace, control chars
+ // Allow single braces and slashes with lookahead constraints
+ pattern:
+ /(?:[^<>"'{}=\\& \t\r\n\v\f/-]|{(?!{)|}(?!})|\/(?!>)|\-(?!\-+>)|&(?!#\d+;|x[0-9A-Fa-f]+;|[a-zA-Z][a-zA-Z0-9]*;|;))+/,
+ line_breaks: false,
+});
+
+// Define token order - more specific patterns first
+export const AllTokens = [
+ CommentOpen,
+ CommentClose,
+ PragmaKeyword,
+ TemplateOpen,
+ TemplateClose,
+ ClosingOpenBracket, // Must come before OpenBracket
+ SelfCloseBracket, // Must come before CloseBracket
+ OpenBracket,
+ CloseBracket,
+ Equals,
+ DoubleQuote,
+ SingleQuote,
+ BackslashEscape,
+ Backslash,
+ CharacterEntity,
+ Identifier,
+ Whitespace,
+ Arbitrary,
+];
+
+export const XmlBracketTokens = [
+ CommentOpen,
+ CommentClose,
+ ClosingOpenBracket,
+ SelfCloseBracket,
+ OpenBracket,
+ CloseBracket,
+];
+
+export const TokensComment = AllTokens.filter((tokenType) => tokenType !== CommentClose);
+// Tokens used in comment, but disallow whitespace, used in @pragma as "identifiers".
+export const TokensCommentIdentifiers = TokensComment.filter((tokenType) => tokenType !== Whitespace);
+
+// Tokens used in expressions (inside {{ and }}), excluding the closing braces.
+// Opening braces {{ should work, but they should be generally properly escaped inside to avoid confusion.
+export const TokensExpression = AllTokens.filter((tokenType) => tokenType !== TemplateClose);
+
+// Tokens used in quotes. The quoted strings do not allow template expressions inside.
+// The only application currently is in @pragma directive options.
+// Quoted strings can contain backslash escapes. Character entities will be however shown as is.
+export const TokensDoubleQuoted = AllTokens.filter((tokenType) => tokenType !== DoubleQuote);
+export const TokensSingleQuoted = AllTokens.filter((tokenType) => tokenType !== SingleQuote);
+
+// Tokens used in quotes, but within quotes distinguish from expressions (surrounded by {{ and }}).
+export const TokensDoubleQuotedExpression = AllTokens.filter(
+ (tokenType) => tokenType !== DoubleQuote && tokenType !== TemplateOpen,
+);
+export const TokensSingleQuotedExpression = AllTokens.filter(
+ (tokenType) => tokenType !== SingleQuote && tokenType !== TemplateOpen,
+);
+
+// Text contents inside XML elements.
+// Like XML/HTML, the contents here can have `&` XML entities to escape special characters.
+// Escaped characters via backslash will be shown as is without escape handling.
+export const TokensTextContent = AllTokens.filter(
+ (tokenType) => !XmlBracketTokens.includes(tokenType) && tokenType !== TemplateOpen,
+);
+
+// Extended POML Lexer class
+export class ExtendedPomlLexer {
+ private lexer: Lexer;
+
+ constructor() {
+ this.lexer = new Lexer(AllTokens);
+ }
+
+ public tokenize(text: string) {
+ const lexingResult = this.lexer.tokenize(text);
+
+ if (lexingResult.errors.length > 0) {
+ console.warn('Lexing errors:', lexingResult.errors);
+ }
+
+ return {
+ tokens: lexingResult.tokens,
+ errors: lexingResult.errors,
+ groups: lexingResult.groups,
+ };
+ }
+}
+
+// Create a single instance to export
+export const extendedPomlLexer = new ExtendedPomlLexer();
+
+// Export token types for use in parser
+export type { IToken, ILexingError, ILexingResult } from 'chevrotain';
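+
+// Tokenization sketch (illustrative): feed raw POML text to the singleton lexer
+// and inspect the resulting chevrotain tokens.
+// const { tokens, errors } = extendedPomlLexer.tokenize('<p>Hello {{ name }}</p>');
+// tokens.map((t) => t.tokenType.name);
+// // e.g. OpenBracket, Identifier, CloseBracket, ..., TemplateOpen, ...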
diff --git a/packages/poml/next/nodes.ts b/packages/poml/next/nodes.ts
new file mode 100644
index 00000000..4673ae43
--- /dev/null
+++ b/packages/poml/next/nodes.ts
@@ -0,0 +1,479 @@
+import { Range } from './types';
+import { CstNode, IToken } from 'chevrotain';
+
+export interface AstNode {
+ range: Range; // start and end offsets in the source text
+}
+
+/**
+ * Plain token sequences helpers from the lexer.
+ */
+export interface CstTokens extends CstNode {
+ children: {
+ Content?: IToken[];
+ };
+}
+
+/**
+ * Represents a template interpolation with double curly braces,
+ * or sometimes without braces in specific attributes.
+ *
+ * Template nodes handle variable interpolation in POML, containing an
+ * expression that will be evaluated and substituted at runtime. The node
+ * preserves the template syntax for proper rendering and error reporting.
+ *
+ * Cases that apply:
+ * - Standalone template variables: `{{ userName }}`, `{{ count + 1 }}`
+ * - Template expressions in text: part of "Hello {{ name }}!"
+ * - Complex expressions: `{{ users.map(u => u.name).join(", ") }}`
+ * - Conditional rendering: `{{ isVisible ? "Show" : "Hide" }}`
+ * - Template usage in if attributes: `condition` in `if="condition"`
+ *
+ * Cases that do not apply:
+ * - Full attribute expressions: `if="x > 0"` (use AttributeNode)
+ * - Plain text: `Hello World` (use LiteralNode)
+ * - Single braces: `{ not a template }` (treated as plain text)
+ * - Template elements: {{ this is a jinja template }} (use LiteralNode)
+ * - With quotes: `"{{ var }}"` (use ValueNode)
+ */
+export interface TemplateNode extends AstNode {
+ kind: 'TEMPLATE';
+ value: LiteralNode;
+}
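+
+// Shape sketch (illustrative): the template `{{ name }}` becomes
+// { kind: 'TEMPLATE', value: { kind: 'STRING', value: 'name', range: ... }, range: ... }
+// where `value` holds the raw expression text to be evaluated at render time.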
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+
+export interface CstTemplateNode extends CstNode {
+ children: {
+ TemplateOpen?: IToken[];
+ WsAfterOpen?: IToken[];
+ // Content inside {{ and }} is treated as a single expression token.
+ // Eats everything until the next }} (or the whitespace before it).
+ // Handles \{{ and \}} escapes. We won't escape other chars here.
+ Content?: CstTokens[];
+ // If it's close to the ending }}, try to eat whitespace before it.
+ WsAfterContent?: IToken[];
+ TemplateClose?: IToken[];
+ };
+}
+
+/**
+ * Represents plain text content without any special syntax.
+ *
+ * Literal nodes are the most basic content nodes, containing literal text
+ * that requires no processing. They are used both for content and as
+ * components of other nodes (like attribute keys and tag names).
+ *
+ * Cases that apply:
+ * - Plain text content: `Hello World`, `This is a paragraph`
+ * - Long text blocks in `<text>` elements: `some long text continued`
+ * - Attribute keys: the `class` in `class="container"`
+ * - Tag names: the `div` in `<div>`
+ * - Identifiers: variable names like `item` in for loops
+ * - Whitespace and formatting text between elements
+ * - Expressions: `x > 0` (use ExpressionNode)
+ *
+ * Cases that do not apply:
+ * - Text containing templates: `Hello {{ name }}` (use ValueNode with children)
+ * - Quoted strings in attributes: `"value"` (use ValueNode)
+ * - Template variables: `{{ var }}` (use TemplateNode)
+ */
+export interface LiteralNode extends AstNode {
+ kind: 'STRING';
+ value: string;
+}
+
+/**
+ * The value of an attribute, which may contain text and/or templates.
+ * Used specifically for the "quotes" in attribute values.
+ *
+ * Value nodes are containers for mixed content, handling both pure text
+ * and interpolated templates. They preserve quote information when used
+ * as attribute values and support complex content composition.
+ *
+ * Cases that apply:
+ * - Quoted attribute values: `"some text"`, `'single quoted'`
+ * - Mixed content with templates: `"Hello, {{ userName }}!"`
+ * - Unquoted template values in certain attribute contexts (e.g., if="condition_expr")
+ * - Multi-part content: `"Price: ${{amount}} USD"`
+ *
+ * Cases that do not apply:
+ * - Attribute keys: `class=...` (the `class` part uses LiteralNode)
+ * - Pure expressions without quotes: `if=condition` (illegal)
+ * - Mixture of template and non-templates in element contents (use LiteralNode and TemplateNode directly)
+ *
+ * Note: The range includes quotes if present, but children exclude them.
+ */
+export interface ValueNode extends AstNode {
+ kind: 'VALUE';
+ children: (LiteralNode | TemplateNode)[];
+}
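+
+// Shape sketch (illustrative): the attribute value `"Hello {{ name }}!"` becomes
+// { kind: 'VALUE', children: [
+//   { kind: 'STRING', value: 'Hello ' },
+//   { kind: 'TEMPLATE', value: { kind: 'STRING', value: 'name' } },
+//   { kind: 'STRING', value: '!' },
+// ] } (ranges omitted here for brevity).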
+
+/**
+ * Related CST node interfaces for parsing stage.
+ * The following two interfaces are for quoted strings and will be transformed into ValueNode.
+ */
+export interface CstQuotedNode extends CstNode {
+ children: {
+ OpenQuote?: IToken[];
+ // This is a normal quoted string without templates inside.
+ Content?: CstTokens[];
+ CloseQuote?: IToken[];
+ };
+}
+
+export interface CstQuotedTemplateNode extends CstNode {
+ children: {
+ OpenQuote?: IToken[];
+ // Allows "Hello {{ friend["abc"] }}!" - mix of text and templates (with quotes).
+ Content?: (CstTokens | CstTemplateNode)[];
+ CloseQuote?: IToken[];
+ };
+}
+
+/**
+ * Represents a for-loop iteration construct in POML.
+ *
+ * For loops enable iterative rendering of elements, following the pattern
+ * "iterator in collection". This node captures both the loop variable
+ * and the collection expression for runtime evaluation.
+ *
+ * Cases that apply:
+ * - Simple iteration: `"item in items"`
+ * - Property access: `"user in data.users"`
+ * - Array literals: `"num in [1, 2, 3]"`
+ * - Method calls in single quotes: `'result in getResults()'`
+ * - Nested property iteration: `'task in project.tasks.active'`
+ *
+ * Cases that do not apply (not yet supported):
+ * - Without quotes: `item in items` (must be in quotes for now)
+ * - Advanced loop syntax (not yet supported): `(item, index) in items`
+ * - Destructuring patterns (not yet supported): `{name, age} in users`
+ * - Conditional loops: `if` attributes (use separate condition handling)
+ * - Template interpolation: `{{ items }}` (use TemplateNode)
+ */
+export interface ForIteratorNode extends AstNode {
+ kind: 'FORITERATOR';
+ iterator: LiteralNode;
+ collection: LiteralNode;
+}
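+
+// Shape sketch (illustrative): `for="item in items"` yields
+// { kind: 'FORITERATOR',
+//   iterator: { kind: 'STRING', value: 'item' },
+//   collection: { kind: 'STRING', value: 'items' } }
+// (ranges omitted for brevity).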
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+export interface CstForIteratorNode extends CstNode {
+ children: {
+ OpenQuote?: IToken[];
+ WsAfterOpen?: IToken[];
+ Iterator?: IToken[];
+ WsAfterIterator?: IToken[];
+ InKeyword?: IToken[];
+ WsAfterIn?: IToken[];
+ // Follows the same parsing rules as template expression.
+ // But as we are in a quoted string, we need to handle
+ // backslash escapes like \" and \'.
+ // Greedily match until the next unescaped quote or ws before it.
+ Collection?: CstTokens[];
+ WsAfterCollection?: IToken[];
+ CloseQuote?: IToken[];
+ };
+}
+
+/**
+ * Represents a standard attribute on a POML element.
+ *
+ * Attributes provide metadata and configuration for elements. They consist
+ * of a key-value pair where the key is always a simple string and the value
+ * can be a complex composition of text and templates.
+ *
+ * It also supports for-loop attributes via ForIterator, which contains
+ * loop iteration syntax rather than a simple value. It enables
+ * elements to be rendered multiple times based on a collection.
+ *
+ * Cases that apply:
+ * - Simple attributes: `class="container"`, `id='main'`
+ * - Template values: `title="{{ pageTitle }}"` or `title={{ pageTitle }}`
+ * - Mixed values: `placeholder="Enter {{ fieldName }}..."`
+ * - For attributes: `for="item in items"` (key is "for", value is ForIteratorNode)
+ * - Computed collections: `for='i in [...Array(5).keys()]'`
+ *
+ * Cases that do not apply:
+ * - Boolean/presence attributes: `disabled`, `checked` (not yet supported)
+ * - Spread attributes (not yet supported): `{...props}`
+ * - Dynamic attribute names (not supported): `[attrName]="value"`
+ */
+export interface AttributeNode extends AstNode {
+ kind: 'ATTRIBUTE';
+ key: LiteralNode;
+ value: ValueNode | ForIteratorNode;
+}
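+
+// Shape sketch (illustrative): `class="container"` yields
+// { kind: 'ATTRIBUTE',
+//   key: { kind: 'STRING', value: 'class' },
+//   value: { kind: 'VALUE', children: [{ kind: 'STRING', value: 'container' }] } }
+// while `for="item in items"` carries a FORITERATOR value instead (ranges omitted).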
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+export interface CstAttributeNode extends CstNode {
+ children: {
+ AttributeKey?: IToken[];
+ WsAfterKey?: IToken[];
+ Equals?: IToken[];
+ WsAfterEquals?: IToken[];
+ // Choose between one: john="doe", john='doe', john={{ template }}, for="i in items"
+ quotedValue?: CstQuotedTemplateNode[];
+ templatedValue?: CstTemplateNode[];
+ forIteratorValue?: CstForIteratorNode[];
+ };
+}
+
+/**
+ * Represents a prefix partial of opening tag in POML markup.
+ *
+ * Open tags mark the beginning of an element that expects a corresponding
+ * closing tag. They may contain attributes that configure the element's
+ * behavior and appearance.
+ *
+ * This is an opening tag without the ending close bracket.
+ * Allow prefix sharing with SelfCloseElementNode.
+ *
+ * Examples:
+ * - `<task`, `<div class="container"`
+ * - Any open tag prefix before its closing `>` or `/>` bracket
+ */
+export interface CstOpenTagPartialNode extends CstNode {
+ children: {
+ OpenBracket?: IToken[];
+ WsAfterOpen?: IToken[];
+ TagName?: IToken[];
+ WsBeforeEachAttribute?: IToken[];
+ Attribute?: CstAttributeNode[];
+ };
+}
+
+/**
+ * Represents a closing tag in POML markup.
+ *
+ * Close tags terminate an element previously opened by a matching open tag.
+ *
+ * Examples:
+ * - `</task>`, `</div>`
+ * - Nested structure endings: `</section>`, `</document>`
+ * - Any valid POML element closure
+ */
+export interface CstCloseTagNode extends CstNode {
+ children: {
+ ClosingOpenBracket?: IToken[];
+ WsAfterOpen?: IToken[];
+ TagName?: IToken[];
+ WsBeforeClose?: IToken[];
+ CloseBracket?: IToken[];
+ };
+}
+
+/**
+ * Represents a complete POML element with its content.
+ *
+ * Element nodes are high-level constructs that represent semantic POML
+ * components. They contain a tag name, which contains optional attributes,
+ * and may have child contents including other elements, text, or values.
+ *
+ * It should also support literal elements, which are special POML elements
+ * that treat their content as literal text without any template variable interpolation.
+ * Content is preserved exactly as written, useful for code samples or pre-formatted text.
+ *
+ * Alternatively, it also supports self-closing elements.
+ *
+ * Cases that apply:
+ * - Any elements: `<task>...content...</task>`
+ * - Output schemas with templates: `<output-schema>{{ schemaDefinition }}</output-schema>`
+ * - Literal text elements: `<text>Literal {{ not_interpolated }}</text>` (literal elements)
+ * - Self-closing elements: `<img src="photo.jpg" />`
+ * - Runtime configurations: `<runtime temperature="0.5" />`
+ *
+ * Cases that do not apply:
+ * - Literal text content: plain text (use LiteralNode)
+ * - Template variables: `{{ var }}` (use TemplateNode)
+ * - Meta elements: `<meta>` tags (use MetaNode)
+ *
+ * Note:
+ * - Literal element node is different from elements which do not support nested tags;
+ * it is handled at the CST parsing stage.
+ */
+export interface ElementNode extends AstNode {
+ kind: 'ELEMENT';
+ name: string;
+ attributes: AttributeNode[];
+ // Children is undefined for self-closing tags.
+ // If it's not self-closing, children is at least an empty array.
+ children?: ElementContentNode[];
+}
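+
+// Shape sketch (illustrative): `<div class="container">text</div>` yields
+// { kind: 'ELEMENT', name: 'div',
+//   attributes: [/* one ATTRIBUTE node for class */],
+//   children: [{ kind: 'STRING', value: 'text' }] }
+// whereas a self-closing `<img src="photo.jpg" />` has no child content.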
+
+export type ElementContentNode = ElementNode | CommentNode | PragmaNode | LiteralNode | TemplateNode;
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+export interface CstElementNode extends CstNode {
+ children: {
+ OpenTagPartial?: CstOpenTagPartialNode[];
+ OpenTagCloseBracket?: IToken[];
+ Content?: CstElementContentNode[];
+ // For literal elements like <text> and <template>.
+ // When `<text>` is used, the parser eats everything including tags and comments,
+ // including nested `<text>` itself, until a matching `</text>` is found.
+ // The tagName can only be "text" and "template" for literal elements.
+ // If you need a literal `</text>` in your POML content, place it outside of literal elements.
+ TextContent?: CstTokens[]; // For literal elements like <text>
+ CloseTag?: CstCloseTagNode[];
+ // Alternative, it can also be a self-closing tag.
+ SelfCloseBracket?: IToken[];
+ };
+}
+
+export interface CstElementContentNode extends CstNode {
+ children: {
+ Element?: CstElementNode[];
+ Comment?: CstCommentNode[];
+ Pragma?: CstPragmaNode[];
+ Template?: CstTemplateNode[];
+ TextContent?: CstTokens[];
+ };
+}
+
+/**
+ * Represents an HTML-like line/block comment in POML.
+ *
+ * Comment nodes preserve authoring notes or disabled content that should not
+ * affect rendering. The `value` holds the comment text without the `<!--`
+ * and `-->` delimiters.
+ *
+ * Examples:
+ * - `<!-- a comment -->`
+ */
+export interface CommentNode extends AstNode {
+ kind: 'COMMENT';
+ value: LiteralNode;
+}
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+export interface CstCommentNode extends CstNode {
+ children: {
+ CommentOpen?: IToken[];
+ Content?: CstTokens[];
+ CommentClose?: IToken[];
+ };
+}
+
+/**
+ * Represents a pragma directive carried inside a comment.
+ *
+ * Pragmas are special instructions for parser/compiler. They usually appear
+ * inside comments and start with `@pragma`. For now we keep this node simple
+ * with a single `value` that contains the full directive text after
+ * `@pragma` (e.g. `components +reference -table`).
+ *
+ * Examples:
+ * - Specify version: `<!-- @pragma version 1.0 -->`
+ * - Turn tags on/off: `<!-- @pragma components +reference -table -->`
+ * - Turn speaker roles on/off: `<!-- @pragma speaker multiple -->` or `single`
+ * - White space policy: `<!-- @pragma whitespace pre -->` or `trim`, `collapse`
+ *
+ * Notes on white space policy:
+ * - `pre`: preserve all whitespace as-is
+ * - `trim`: trim leading/trailing whitespace in each element
+ * - `collapse`: trim + collapse consecutive whitespace into a single space
+ * If there are two inline="false" elements next to each other, space between them will be deleted.
+ *
+ * Each element type will have its own default whitespace policy.
+ * For example, `<code>` defaults to `pre`, while `<p>` defaults to `collapse`.
+ * However, when a pragma is set, it overrides the default for subsequent elements.
+ * It will affect the AST constructing stages, and also affecting the props sent to components.
+ */
+export interface PragmaNode extends AstNode {
+ kind: 'PRAGMA';
+ identifier: LiteralNode;
+ options: LiteralNode[];
+}
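+
+// Shape sketch (illustrative): `<!-- @pragma whitespace collapse -->` yields
+// { kind: 'PRAGMA',
+//   identifier: { kind: 'STRING', value: 'whitespace' },
+//   options: [{ kind: 'STRING', value: 'collapse' }] }
+// (ranges omitted for brevity).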
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+export interface CstPragmaNode extends CstNode {
+ children: {
+ CommentOpen?: IToken[];
+ WsAfterOpen?: IToken[];
+ PragmaKeyword?: IToken[];
+ WsAfterPragma?: IToken[];
+ PragmaIdentifier?: IToken[];
+ WsBeforeEachOption?: IToken[];
+ PragmaOption?: (IToken | CstQuotedNode)[];
+ WsAfterAll?: IToken[];
+ CommentClose?: IToken[];
+ };
+}
+
+/**
+ * Represents the root node of a POML document tree.
+ *
+ * Root nodes serve as the top-level container for all document content when
+ * there isn't an explicit `` wrapper. They provide a consistent entry
+ * point for document traversal and processing.
+ *
+ * Cases that apply:
+ * - Documents without `<poml>` wrapper
+ * - Documents with multiple top-level elements
+ * - Documents with `<poml>` but surrounded by white spaces or comments
+ *
+ * Cases that do not apply:
+ * - All nested elements
+ */
+export interface RootNode extends AstNode {
+ kind: 'ROOT';
+ children: ElementContentNode[];
+}
+
+/**
+ * Related CST node interfaces for parsing stage.
+ */
+export interface CstRootNode extends CstNode {
+ children: {
+ Content?: CstElementContentNode[];
+ };
+}
+
+// Keep these keys required; everything else becomes recursively optional
+type DeepPartialExcept<T, K extends PropertyKey> =
+ // arrays
+ T extends (infer U)[]
+ ? DeepPartialExcept<U, K>[]
+ : // functions (leave as-is)
+ T extends (...args: any) => any
+ ? T
+ : // objects
+ T extends object
+ ? { [P in keyof T as P extends K ? P : never]-?: T[P] } & {
+ [P in keyof T as P extends K ? never : P]?: DeepPartialExcept<T[P], K> | undefined;
+ }
+ : T;
+
+// Keep only "kind" required; everything else is optional, recursively.
+type Draft<T> = DeepPartialExcept<T, 'kind'>;
+
+// Union of your strict nodes
+export type StrictNode =
+ | TemplateNode
+ | LiteralNode
+ | ValueNode
+ | ForIteratorNode
+ | AttributeNode
+ | ElementNode
+ | CommentNode
+ | PragmaNode
+ | RootNode;
+
+// The "loose" counterpart you can safely produce during parsing.
+export type DraftNode = Draft<StrictNode>;
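+
+// Illustrative use of the draft types: during CST-to-AST conversion a visitor can
+// create a node knowing only its `kind` and attach the remaining fields as the
+// corresponding child CST nodes are visited.
+// const draft: DraftNode = { kind: 'ELEMENT', name: 'p', attributes: [] };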
diff --git a/packages/poml/next/source.ts b/packages/poml/next/source.ts
new file mode 100644
index 00000000..50a09fb3
--- /dev/null
+++ b/packages/poml/next/source.ts
@@ -0,0 +1,115 @@
+import * as fs from 'fs';
+import { SourceFileCache, Position } from './types';
+
+export class SourceManager {
+ private sourceCache = new Map<string, SourceFileCache>();
+ private currentSourceFile?: string;
+ private currentSourceContent?: string;
+
+ /**
+ * Set the current source file context for subsequent errors
+ */
+ public setCurrentFile(sourceFile: string, content?: string): void {
+ this.currentSourceFile = sourceFile;
+ this.currentSourceContent = content;
+
+ if (content && sourceFile) {
+ this.cacheSource(sourceFile, content);
+ }
+ }
+
+ /**
+ * Clear current file context
+ */
+ public clearCurrentFile(): void {
+ this.currentSourceFile = undefined;
+ this.currentSourceContent = undefined;
+ }
+
+ public getCurrentFile(): string | undefined {
+ return this.currentSourceFile;
+ }
+
+ public getCurrentFileContent(): string | undefined {
+ return this.currentSourceContent;
+ }
+
+ /**
+ * Clear all
+ */
+ public clear(): void {
+ this.sourceCache.clear();
+ this.clearCurrentFile();
+ }
+
+ /**
+ * Cache source file content
+ */
+ private cacheSource(file: string, content: string): void {
+ const lines = content.split('\n');
+ const lineStarts: number[] = [0];
+
+ let pos = 0;
+ for (const line of lines) {
+ pos += line.length + 1; // +1 for newline
+ lineStarts.push(pos);
+ }
+
+ this.sourceCache.set(file, {
+ content,
+ lines,
+ lineStarts,
+ });
+ }
+
+ /**
+ * Load source file if not cached
+ */
+ public loadSource(file: string): SourceFileCache | null {
+ if (this.sourceCache.has(file)) {
+ return this.sourceCache.get(file)!;
+ }
+
+ try {
+ const content = fs.readFileSync(file, 'utf8');
+ this.cacheSource(file, content);
+ return this.sourceCache.get(file)!;
+ } catch (error) {
+ return null;
+ }
+ }
+
+ /**
+ * Convert byte position to line/column
+ */
+ public indexToPosition(source: SourceFileCache, index: number): Position {
+ const { lineStarts } = source;
+
+ // Binary search for the line
+ let line = 0;
+ let left = 0;
+ let right = lineStarts.length - 1;
+
+ while (left < right) {
+ const mid = Math.floor((left + right + 1) / 2);
+ if (lineStarts[mid] <= index) {
+ left = mid;
+ } else {
+ right = mid - 1;
+ }
+ }
+
+ line = left;
+ const column = index - lineStarts[line];
+
+ return {
+ line: line + 1, // 1-based
+ column: column + 1, // 1-based
+ index,
+ };
+ }
+}
+
+// Create singleton instance
+const sourceManager = new SourceManager();
+export default sourceManager;
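+
+// Usage sketch (illustrative; 'example.poml' is a hypothetical path):
+// const src = sourceManager.loadSource('example.poml');
+// if (src) {
+//   const pos = sourceManager.indexToPosition(src, 42);
+//   console.log(`${pos.line}:${pos.column}`); // 1-based line and column
+// }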
diff --git a/packages/poml/next/types.ts b/packages/poml/next/types.ts
new file mode 100644
index 00000000..ce6ac3b5
--- /dev/null
+++ b/packages/poml/next/types.ts
@@ -0,0 +1,48 @@
+/**
+ * Range in source file (byte positions)
+ */
+export interface Range {
+ start: number;
+ end: number;
+}
+
+/**
+ * Error severity levels
+ */
+export enum Severity {
+ ERROR = 'error',
+ WARNING = 'warning',
+ INFO = 'info',
+}
+
+/**
+ * Diagnostic interface
+ */
+export interface Diagnostic {
+ severity: Severity;
+ message: string;
+ sourceFile?: string;
+ range?: Range;
+ code?: string;
+ hint?: string;
+ originalError?: Error;
+}
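+
+// Example (illustrative) of a populated diagnostic; the file path and hint are hypothetical:
+// const d: Diagnostic = {
+//   severity: Severity.ERROR,
+//   message: 'Mismatched closing tag',
+//   sourceFile: 'example.poml',
+//   range: { start: 10, end: 16 },
+//   hint: 'Check that every opening tag has a matching closing tag',
+// };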
+
+/**
+ * Position with line and column
+ */
+export interface Position {
+ line: number;
+ column: number;
+ index: number;
+}
+
+/**
+ * Source file cache entry
+ */
+export interface SourceFileCache {
+ filePath?: string;
+ content: string;
+ lines: string[];
+ lineStarts: number[];
+}
diff --git a/packages/poml/reader/index.tsx b/packages/poml/reader/index.tsx
deleted file mode 100644
index 627c20c8..00000000
--- a/packages/poml/reader/index.tsx
+++ /dev/null
@@ -1,3 +0,0 @@
-import { Reader } from './base';
-
-class DispatchReader extends Reader {}
diff --git a/packages/poml/reader/meta.ts b/packages/poml/reader/meta.ts
deleted file mode 100644
index 2164c6d3..00000000
--- a/packages/poml/reader/meta.ts
+++ /dev/null
@@ -1,3 +0,0 @@
-import { Reader } from './base';
-
-class MetaReader extends Reader {}
diff --git a/packages/poml/reader/poml.tsx b/packages/poml/reader/poml.tsx
deleted file mode 100644
index 5a771265..00000000
--- a/packages/poml/reader/poml.tsx
+++ /dev/null
@@ -1,3 +0,0 @@
-import { Reader } from './base';
-
-export class PomlReader extends Reader {}
diff --git a/packages/poml/reader/segment.ts b/packages/poml/reader/segment.ts
deleted file mode 100644
index cf4e4425..00000000
--- a/packages/poml/reader/segment.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-export interface Segment {
- // Unique ID for caching and React keys
- id: string;
- kind: 'META' | 'TEXT' | 'POML';
- start: number;
- end: number;
- // The raw string content of the segment
- content: string;
- // The path to the file or resource this segment belongs to
- path?: string;
- // Reference to the parent segment
- parent?: Segment;
- // Nested segments (e.g., a POML block within text)
- children: Segment[];
- // For POML segments, the name of the root tag (e.g., 'task')
- tagName?: string;
-}
-
-export function createSegments(content: string, path?: string): Segment[] {
- throw new Error('createSegments is not implemented yet');
-}
diff --git a/packages/poml/reader/text.tsx b/packages/poml/reader/text.tsx
deleted file mode 100644
index 2b2e25ca..00000000
--- a/packages/poml/reader/text.tsx
+++ /dev/null
@@ -1,3 +0,0 @@
-import { Reader } from './base';
-
-export class PureTextReader extends Reader {}
diff --git a/packages/poml/tests/reader/ast.test.ts b/packages/poml/tests/reader/ast.test.ts
new file mode 100644
index 00000000..6d1c1d54
--- /dev/null
+++ b/packages/poml/tests/reader/ast.test.ts
@@ -0,0 +1,810 @@
+import { describe, expect, test, beforeEach } from '@jest/globals';
+import { extendedPomlLexer } from 'poml/next/lexer';
+import { ExtendedPomlParser } from 'poml/next/cst';
+import { cstToAst, ExtendedPomlAstVisitor } from 'poml/next/ast';
+import * as diagnostics from 'poml/next/diagnostics';
+import {
+ RootNode,
+ ElementNode,
+ LiteralNode,
+ TemplateNode,
+ ValueNode,
+ PragmaNode,
+ CommentNode,
+ ForIteratorNode,
+ AttributeNode,
+} from 'poml/next/nodes';
+import { CstNode } from 'chevrotain';
+
+// Helper function to lex, parse and build AST from raw input
+function parseToAst(input: string): RootNode {
+ // Clear diagnostics before each test
+ diagnostics.clear();
+
+ // Tokenize
+ const lexResult = extendedPomlLexer.tokenize(input);
+ expect(lexResult.errors).toHaveLength(0);
+
+ // Parse to CST
+ const parser = new ExtendedPomlParser();
+ parser.input = lexResult.tokens;
+ const cst = parser.root();
+ expect(parser.errors).toHaveLength(0);
+
+ // Convert to AST
+ return cstToAst(cst);
+}
+
+// Helper to parse specific rule and convert to AST
+function parseRule<T>(input: string, rule: (parser: ExtendedPomlParser) => CstNode): T {
+ diagnostics.clear();
+
+ const lexResult = extendedPomlLexer.tokenize(input);
+ expect(lexResult.errors).toHaveLength(0);
+
+ const parser = new ExtendedPomlParser();
+ parser.input = lexResult.tokens;
+ const cst = rule(parser);
+ expect(parser.errors).toHaveLength(0);
+
+ const visitor = new ExtendedPomlAstVisitor();
+ return visitor.visit(cst) as T;
+}
+
+describe('AST Visitor - Individual Rules', () => {
+ beforeEach(() => {
+ diagnostics.clear();
+ });
+
+ describe('root rule', () => {
+ test('empty root', () => {
+ const result = parseToAst('');
+ expect(result).toStrictEqual({
+ kind: 'ROOT',
+ children: [],
+ range: { start: 0, end: 0 },
+ });
+ });
+
+ test('text only root', () => {
+ const result = parseToAst('Hello World');
+ expect(result).toStrictEqual({
+ kind: 'ROOT',
+ children: [
+ {
+ kind: 'STRING',
+ value: 'Hello World',
+ range: { start: 0, end: 10 },
+ },
+ ],
+ range: { start: 0, end: 10 },
+ });
+ });
+
+ test('mixed content root', () => {
+ const result = parseToAst('Hello {{ name }}!');
+ expect(result.kind).toBe('ROOT');
+ expect(result.children).toHaveLength(3);
+
+ expect(result.children[0]).toStrictEqual({
+ kind: 'STRING',
+ value: 'Hello ',
+ range: { start: 0, end: 5 },
+ });
+
+ expect(result.children[1]).toStrictEqual({
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'name', range: expect.any(Object) },
+ range: expect.any(Object),
+ });
+
+ expect(result.children[2]).toStrictEqual({
+ kind: 'STRING',
+ value: '!',
+ range: { start: 16, end: 16 },
+ });
+ });
+ });
+
+ describe('template rule', () => {
+ test('simple template', () => {
+ const result = parseRule('{{ var }}', (p) => p.template());
+ expect(result).toStrictEqual({
+ kind: 'TEMPLATE',
+ value: {
+ kind: 'STRING',
+ value: 'var',
+ range: expect.any(Object),
+ },
+ range: expect.any(Object),
+ });
+ });
+
+ test('complex expression template', () => {
+ const result = parseRule('{{ user.name.toUpperCase() }}', (p) => p.template());
+ expect(result.value.value).toBe('user.name.toUpperCase()');
+ });
+
+ test('template without spaces', () => {
+ const result = parseRule('{{count}}', (p) => p.template());
+ expect(result.value.value).toBe('count');
+ });
+ });
+
+ describe('comment rule', () => {
+ test('simple comment', () => {
+ const result = parseRule('<!-- hello -->', (p) => p.comment());
+ expect(result).toStrictEqual({
+ kind: 'COMMENT',
+ value: {
+ kind: 'STRING',
+ value: ' hello ',
+ range: expect.any(Object),
+ },
+ range: expect.any(Object),
+ });
+ });
+
+ test('multiline comment', () => {
+ const result = parseRule('<!-- line 1\nline 2 -->', (p) => p.comment());
+ expect(result.value.value).toContain('line 1');
+ expect(result.value.value).toContain('line 2');
+ });
+ });
+
+ describe('pragma rule', () => {
+ test('pragma with identifier only', () => {
+ const result = parseRule('<!-- @pragma version  -->', (p) => p.pragma());
+ expect(result).toStrictEqual({
+ kind: 'PRAGMA',
+ identifier: {
+ kind: 'STRING',
+ value: 'version',
+ range: { start: 13, end: 19 },
+ },
+ options: [],
+ range: { start: 0, end: 24 },
+ });
+ });
+
+ test('pragma with unquoted options', () => {
+ const result = parseRule('<!-- @pragma components +reference -table -->', (p) => p.pragma());
+ expect(result).toStrictEqual({
+ kind: 'PRAGMA',
+ identifier: {
+ kind: 'STRING',
+ value: 'components',
+ range: { start: 13, end: 22 },
+ },
+ options: [
+ { kind: 'STRING', value: '+reference', range: { start: 24, end: 33 } },
+ { kind: 'STRING', value: '-table', range: { start: 35, end: 40 } },
+ ],
+ range: { start: 0, end: 44 },
+ });
+ });
+
+ test('pragma with quoted options', () => {
+ const result = parseRule('<!-- @pragma whitespace "pre formatted" -->', (p) => p.pragma());
+ expect(result).toStrictEqual({
+ kind: 'PRAGMA',
+ identifier: {
+ kind: 'STRING',
+ value: 'whitespace',
+ range: { start: 13, end: 22 },
+ },
+ options: [{ kind: 'STRING', value: 'pre formatted', range: { start: 24, end: 38 } }],
+ range: { start: 0, end: 42 },
+ });
+ });
+ });
+
+ describe('quoted rule', () => {
+ test('simple quoted string', () => {
+ const result = parseRule('"hello world"', (p) => p.quoted());
+ expect(result).toStrictEqual({
+ kind: 'STRING',
+ value: 'hello world',
+ range: expect.any(Object),
+ });
+ });
+
+ test('quoted string with single quotes', () => {
+ const result = parseRule("'hello world'", (p) => p.quoted());
+ expect(result).toStrictEqual({
+ kind: 'STRING',
+ value: 'hello world',
+ range: expect.any(Object),
+ });
+ });
+ });
+
+ describe('quotedTemplate rule', () => {
+ test('quoted string with template', () => {
+ const result = parseRule('"Hello {{ name }}!"', (p) => p.quotedTemplate());
+ expect(result).toStrictEqual({
+ kind: 'VALUE',
+ children: [
+ { kind: 'STRING', value: 'Hello ', range: { start: 1, end: 6 } },
+ {
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'name', range: { start: 10, end: 13 } },
+ range: { start: 8, end: 15 },
+ },
+ { kind: 'STRING', value: '!', range: { start: 16, end: 16 } },
+ ],
+ range: { start: 0, end: 18 },
+ });
+ });
+
+ test('quoted template with only template', () => {
+ const result = parseRule('"{{ expression }}"', (p) => p.quotedTemplate());
+ expect(result).toStrictEqual({
+ kind: 'VALUE',
+ children: [
+ {
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'expression', range: { start: 3, end: 12 } },
+ range: { start: 1, end: 16 },
+ },
+ ],
+ range: { start: 0, end: 17 },
+ });
+ });
+
+ test('multiple templates in quoted string', () => {
+ const result = parseRule('"{{ first }} and {{ second }}"', (p) => p.quotedTemplate());
+ expect(result).toStrictEqual({
+ kind: 'VALUE',
+ children: [
+ {
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'first', range: { start: 4, end: 8 } },
+ range: { start: 1, end: 11 },
+ },
+ { kind: 'STRING', value: ' and ', range: { start: 12, end: 16 } },
+ {
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'second', range: { start: 20, end: 25 } },
+ range: { start: 17, end: 28 },
+ },
+ ],
+ range: { start: 0, end: 29 },
+ });
+ });
+ });
+
+ describe('forIteratorValue rule', () => {
+ test('simple for iterator', () => {
+ const result = parseRule('"item in items"', (p) => p.forIteratorValue());
+ expect(result).toStrictEqual({
+ kind: 'FORITERATOR',
+ iterator: {
+ kind: 'STRING',
+ value: 'item',
+ range: { start: 1, end: 4 },
+ },
+ collection: {
+ kind: 'STRING',
+ value: 'items',
+ range: { start: 9, end: 13 },
+ },
+ range: { start: 0, end: 14 },
+ });
+ });
+
+ test('for iterator with property access', () => {
+ const result = parseRule('"user in data.users"', (p) => p.forIteratorValue());
+ expect(result).toStrictEqual({
+ kind: 'FORITERATOR',
+ iterator: { kind: 'STRING', value: 'user', range: { start: 1, end: 4 } },
+ collection: { kind: 'STRING', value: 'data.users', range: { start: 9, end: 18 } },
+ range: { start: 0, end: 19 },
+ });
+ });
+
+ test('for iterator with complex expression', () => {
+ const result = parseRule('"item in getItems().filter(x => x.active)"', (p) =>
+ p.forIteratorValue(),
+ );
+ expect(result.collection.value).toBe('getItems().filter(x => x.active)');
+ });
+ });
+
+ describe('attribute rule', () => {
+ test('attribute with quoted value', () => {
+ const result = parseRule('class="container"', (p) => p.attribute());
+ expect(result).toStrictEqual({
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'class', range: { start: 0, end: 4 } },
+ value: {
+ kind: 'VALUE',
+ children: [{ kind: 'STRING', value: 'container', range: { start: 6, end: 16 } }],
+ range: { start: 6, end: 16 },
+ },
+ range: { start: 0, end: 17 },
+ });
+ });
+
+ test('attribute with template value', () => {
+ const result = parseRule('title={{ pageTitle }}', (p) => p.attribute());
+ expect(result).toStrictEqual({
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'title', range: { start: 0, end: 4 } },
+ value: {
+ kind: 'VALUE',
+ children: [
+ {
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'pageTitle', range: { start: 9, end: 17 } },
+ range: { start: 6, end: 20 },
+ },
+ ],
+ range: { start: 6, end: 20 },
+ },
+ range: { start: 0, end: 20 },
+ });
+ });
+
+ test('attribute with for iterator', () => {
+ const result = parseRule('for="item in items"', (p) => p.attribute());
+ expect(result).toStrictEqual({
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'for', range: { start: 0, end: 2 } },
+ value: {
+ kind: 'FORITERATOR',
+ iterator: { kind: 'STRING', value: 'item', range: { start: 5, end: 8 } },
+ collection: { kind: 'STRING', value: 'items', range: { start: 13, end: 17 } },
+ range: { start: 4, end: 18 },
+ },
+ range: { start: 0, end: 18 },
+ });
+ });
+
+ test('attribute with quoted template value', () => {
+ const result = parseRule('message="Hello {{ name }}!"', (p) => p.attribute());
+ expect(result).toStrictEqual({
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'message', range: { start: 0, end: 6 } },
+ value: {
+ kind: 'VALUE',
+ children: [
+ { kind: 'STRING', value: 'Hello ', range: { start: 9, end: 14 } },
+ {
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'name', range: { start: 18, end: 21 } },
+ range: { start: 15, end: 24 },
+ },
+ { kind: 'STRING', value: '!', range: { start: 25, end: 25 } },
+ ],
+ range: { start: 8, end: 26 },
+ },
+ range: { start: 0, end: 26 },
+ });
+ });
+ });
+
+ describe('element rule', () => {
+ test('simple element', () => {
+ const result = parseRule('<div>content</div>', (p) => p.element());
+ expect(result).toStrictEqual({
+ kind: 'ELEMENT',
+ name: 'div',
+ attributes: [],
+ children: [{ kind: 'STRING', value: 'content', range: { start: 5, end: 11 } }],
+ range: { start: 0, end: 17 },
+ });
+ });
+
+ test('element with attributes', () => {
+ const result = parseRule('<div class="container" id=\'main\'>text</div>', (p) => p.element());
+ expect(result).toStrictEqual({
+ kind: 'ELEMENT',
+ name: 'div',
+ attributes: [
+ {
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'class', range: { start: 5, end: 9 } },
+ value: {
+ kind: 'VALUE',
+ children: [{ kind: 'STRING', value: 'container', range: { start: 12, end: 20 } }],
+ range: { start: 11, end: 21 },
+ },
+ range: { start: 5, end: 21 },
+ },
+ {
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'id', range: { start: 23, end: 24 } },
+ value: {
+ kind: 'VALUE',
+ children: [{ kind: 'STRING', value: 'main', range: { start: 27, end: 30 } }],
+ range: { start: 26, end: 31 },
+ },
+ range: { start: 23, end: 31 },
+ },
+ ],
+ children: [{ kind: 'STRING', value: 'text', range: { start: 33, end: 36 } }],
+ range: { start: 0, end: 42 },
+ });
+ });
+
+ test('self-closing element', () => {
+ const result = parseRule('<img src="photo.jpg" />', (p) => p.element());
+ expect(result).toStrictEqual({
+ kind: 'ELEMENT',
+ name: 'img',
+ attributes: [
+ {
+ kind: 'ATTRIBUTE',
+ key: { kind: 'STRING', value: 'src', range: { start: 5, end: 7 } },
+ value: {
+ kind: 'VALUE',
+ children: [{ kind: 'STRING', value: 'photo.jpg', range: { start: 10, end: 18 } }],
+ range: { start: 9, end: 19 },
+ },
+ range: { start: 5, end: 19 },
+ },
+ ],
+ children: [],
+ range: { start: 0, end: 22 },
+ });
+ });
+
+ test('element with nested content', () => {
+ const result = parseRule('<task>Process {{ data }} carefully</task>', (p) => p.element());
+ expect(result.children).toHaveLength(3);
+ expect(result.children[0]).toStrictEqual({ kind: 'STRING', value: 'Process ', range: { start: 6, end: 13 } });
+ expect(result.children[1]).toStrictEqual({
+ kind: 'TEMPLATE',
+ value: { kind: 'STRING', value: 'data', range: { start: 17, end: 20 } },
+ range: { start: 14, end: 23 },
+ });
+ expect(result.children[2]).toStrictEqual({ kind: 'STRING', value: ' carefully', range: { start: 24, end: 33 } });
+ });
+
+ test('nested elements', () => {
+ const result = parseRule('<div><span>nested</span></div>', (p) => p.element());
+ expect(result.children).toHaveLength(1);
+ expect(result.children[0]).toStrictEqual({
+ kind: 'ELEMENT',
+ name: 'span',
+ attributes: [],
+ children: [{ kind: 'STRING', value: 'nested', range: { start: 11, end: 16 } }],
+ range: { start: 5, end: 23 },
+ });
+ });
+ });
+});
+
+describe('AST Visitor - Error Handling', () => {
+ beforeEach(() => {
+ diagnostics.clear();
+ });
+
+ test('mismatched closing tag reports error', () => {
+ const input = '<task>content</div>';
+ parseRule(input, (p) => p.element());
+
+ const errors = diagnostics.getErrors();
+ expect(errors).toHaveLength(1);
+ expect(errors[0].message).toContain('Mismatched closing tag');
+ expect(errors[0].message).toContain('expected </task>');
+ expect(errors[0].message).toContain('found </div>');
+ });
+
+ test('invalid HTML entity reports error', () => {
+ const input = '&invalidEntity;';
+ parseToAst(input);
+
+ const errors = diagnostics.getErrors();
+ expect(errors).toHaveLength(1);
+ expect(errors[0].message).toContain('Failed to decode HTML entity');
+ });
+
+ test('attribute without value reports error', () => {
+ // This would be caught during parsing, but if we had a malformed CST:
+ const input = '<div class>content</div>';
+ // Note: This test might need adjustment based on actual parser behavior
+ try {
+ parseRule(input, (p) => p.element());
+ const errors = diagnostics.getErrors();
+ // Check if any errors were reported for missing attribute value
+ } catch (e) {
+ // Parser error expected for malformed syntax
+ expect(true).toBe(true);
+ }
+ });
+
+ test('unknown element content reports error', () => {
+ // This tests the fallback case in elementContent
+ diagnostics.clear();
+ parseToAst('normal text'); // Should not cause errors
+ expect(diagnostics.getErrors()).toHaveLength(0);
+ });
+});
+
+describe('AST Visitor - Special Tokens and Escapes', () => {
+ beforeEach(() => {
+ diagnostics.clear();
+ });
+
+ describe('backslash escapes in quoted strings', () => {
+ test('basic escape sequences', () => {
+ const result = parseRule('"line1\\nline2"', (p) => p.quoted());
+ expect(result.value).toBe('line1\nline2');
+ });
+
+ test('unicode escape sequences', () => {
+ const result = parseRule('"\\u0048\\u0065\\u006C\\u006C\\u006F"', (p) => p.quoted()); // "Hello"
+ expect(result.value).toBe('Hello');
+ });
+
+ test('hex escape sequences', () => {
+ const result = parseRule('"\\x48\\x65\\x6C\\x6C\\x6F"', (p) => p.quoted()); // "Hello"
+ expect(result.value).toBe('Hello');
+ });
+
+ test('quote escapes', () => {
+ const result = parseRule('"\\"escaped quotes\\""', (p) => p.quoted());
+ expect(result.value).toBe('"escaped quotes"');
+ });
+
+ test('template brace escapes', () => {
+ const result = parseRule('"\\{{not a template\\}}"', (p) => p.quoted());
+ expect(result.value).toBe('{{not a template}}');
+ });
+
+ test('backslash escape', () => {
+ const result = parseRule('"path\\\\to\\\\file"', (p) => p.quoted());
+ expect(result.value).toBe('path\\to\\file');
+ });
+
+ test('unknown escape sequence', () => {
+ const result = parseRule('"\\q unknown"', (p) => p.quoted());
+ expect(result.value).toBe('\\q unknown'); // Unknown escape returns body with backslash
+ });
+ });
+
+ describe('character entities in text content', () => {
+ test('common HTML entities', () => {
+ const result = parseToAst('&amp; &lt; &gt; &quot; &apos;');
+ expect(result.children[0]).toMatchObject({
+ kind: 'STRING',
+ value: '& < > " \'',
+ });
+ });
+
+ test('numeric character references', () => {
+ const result = parseToAst('&#65; &#x41;'); // Both represent 'A'
+ expect(result.children[0]).toMatchObject({
+ kind: 'STRING',
+ value: 'A A',
+ });
+ });
+
+ test('mixed entities and regular text', () => {
+ const result = parseToAst('Hello &amp; welcome &lt;user&gt;');
+ expect(result.children[0]).toMatchObject({
+ kind: 'STRING',
+ value: 'Hello & welcome <user>',
+ });
+ });
+ });
+
+ describe('escapes in different contexts', () => {
+ test('backslash escapes not processed in text content', () => {
+ const result = parseToAst('This \\n should stay as literal');
+ expect(result.children[0]).toMatchObject({
+ kind: 'STRING',
+ value: 'This \\n should stay as literal',
+ });
+ });
+
+ test('entities not processed in quoted strings', () => {
+ const result = parseRule('"& stays literal"', (p) => p.quoted());
+ expect(result.value).toBe('& stays literal');
+ });
+
+ test('template expressions preserve content', () => {
+ const result = parseRule('{{ "string with { } \\n \n escape" }}', (p) => p.template());
+ expect(result.value.value).toBe('"string with { } \\n \n escape"');
+ });
+ });
+});
+
+// describe('AST Visitor - Complex Integration Tests', () => {
+// beforeEach(() => {
+// diagnostics.clear();
+// });
+
+// test('complex document with multiple element types', () => {
+// const input = `
+//
+//
+//
+//
+// Welcome to {{ appName }}!
+
+//
+// -
+// Task: {{ task.name }} - Status: {{ task.status }}
+//
+//
+//
+
+//
+// `;
+
+// const result = parseToAst(input);
+
+// // Root should contain pragma, whitespace, and element
+// expect(result.kind).toBe('ROOT');
+// expect(result.children.length).toBeGreaterThan(0);
+
+// // Find the pragma
+// const pragma = result.children.find(child => child.kind === 'PRAGMA') as PragmaNode;
+// expect(pragma).toBeDefined();
+// expect(pragma.identifier.value).toBe('whitespace');
+// expect(pragma.options[0].value).toBe('collapse');
+
+// // Find the document element
+// const document = result.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'document'
+// ) as ElementNode;
+// expect(document).toBeDefined();
+
+// // Document should have nested content
+// expect(document.children?.length).toBeGreaterThan(0);
+
+// // Find section with template
+// const section = document.children?.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'section'
+// ) as ElementNode;
+// expect(section).toBeDefined();
+// expect(section.attributes).toHaveLength(1);
+// expect(section.attributes[0].key.value).toBe('title');
+
+// // Check for template in section content
+// const templateInSection = section.children?.find(child => child.kind === 'TEMPLATE');
+// expect(templateInSection).toBeDefined();
+
+// // Find list with for attribute
+// const findElementByName = (children: any[], name: string): ElementNode | undefined =>
+// children.find(child => child.kind === 'ELEMENT' && child.name === name);
+
+// const list = findElementByName(section.children!, 'list');
+// expect(list).toBeDefined();
+
+// const item = findElementByName(list!.children!, 'item');
+// expect(item).toBeDefined();
+
+// // Check for attribute with for iterator
+// const forAttr = item!.attributes.find(attr => attr.key.value === 'for');
+// expect(forAttr).toBeDefined();
+// expect(forAttr!.value.kind).toBe('FORITERATOR');
+
+// const forIterator = forAttr!.value as ForIteratorNode;
+// expect(forIterator.iterator.value).toBe('task');
+// expect(forIterator.collection.value).toBe('tasks');
+
+// // Check footer with entity
+// const footer = findElementByName(document.children, 'footer');
+// expect(footer).toBeDefined();
+// expect(footer!.children[0]).toMatchObject({
+// kind: 'STRING',
+// value: '© 2024 Company', // Entity should be decoded
+// });
+// });
+
+// test('mixed content with templates, comments, and elements', () => {
+// const input = `
+// Processing data for {{ userName }}...
+//
+//
+// Task completed!
+// `;
+
+// const result = parseToAst(input);
+
+// expect(result.children).toHaveLength(6); // whitespace, text, template, text, comment, text, element, text
+
+// // Check template rendering
+// const firstTemplate = result.children.find(child => child.kind === 'TEMPLATE') as TemplateNode;
+// expect(firstTemplate).toBeDefined();
+// expect(firstTemplate.value.value).toContain('userName');
+
+// // Check comment
+// const comment = result.children.find(child => child.kind === 'COMMENT') as CommentNode;
+// expect(comment).toBeDefined();
+// expect(comment.value.value).toContain('Status:');
+
+// // Check progress element with template attributes
+// const progress = result.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'progress'
+// ) as ElementNode;
+// expect(progress).toBeDefined();
+// expect(progress.attributes).toHaveLength(2);
+
+// // Check template in attribute value
+// const valueAttr = progress.attributes.find(attr => attr.key.value === 'value');
+// expect(valueAttr!.value.kind).toBe('VALUE');
+// const valueNode = valueAttr!.value as ValueNode;
+// expect(valueNode.children[0].kind).toBe('TEMPLATE');
+
+// // Check template in element content
+// const progressTemplate = progress.children?.find(child => child.kind === 'TEMPLATE');
+// expect(progressTemplate).toBeDefined();
+// });
+
+// test('deeply nested structure with various features', () => {
+// const input = `
+//
+//
+//
+//
+
+//
+//
+// Analyze the provided dataset & generate insights.
+
+//
+// - Statistical analysis
+// - Data visualization
+// - {{ metric }} calculation
+//
+//
+
+//
+//
+//
+//
+// `;
+
+// const result = parseToAst(input);
+
+// expect(result.children).toHaveLength(1);
+// const poml = result.children[0] as ElementNode;
+// expect(poml.name).toBe('poml');
+// expect(poml.children.length).toBeGreaterThan(0);
+
+// // Verify deep nesting is preserved
+// const task = poml.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'task'
+// ) as ElementNode;
+// expect(task).toBeDefined();
+
+// const description = task.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'description'
+// ) as ElementNode;
+// expect(description).toBeDefined();
+
+// const requirements = description.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'requirements'
+// ) as ElementNode;
+// expect(requirements).toBeDefined();
+
+// // Check for iterator in nested structure
+// const forItem = requirements.children.find(child => {
+// if (child.kind !== 'ELEMENT') return false;
+// const elem = child as ElementNode;
+// return elem.name === 'item' && elem.attributes.some(attr => attr.key.value === 'for');
+// }) as ElementNode;
+
+// expect(forItem).toBeDefined();
+// const forAttr = forItem.attributes.find(attr => attr.key.value === 'for')!;
+// expect(forAttr.value.kind).toBe('FORITERATOR');
+
+// // Verify self-closing elements work
+// const config = poml.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'config'
+// ) as ElementNode;
+// expect(config).toBeDefined();
+
+// const model = config.children.find(child =>
+// child.kind === 'ELEMENT' && (child as ElementNode).name === 'model'
+// ) as ElementNode;
+// expect(model).toBeDefined();
+// expect(model.children).toHaveLength(0); // Self-closing
+// expect(model.attributes.length).toBeGreaterThan(0);
+// });
+// });
diff --git a/packages/poml/tests/reader/cst.test.ts b/packages/poml/tests/reader/cst.test.ts
new file mode 100644
index 00000000..e10377ef
--- /dev/null
+++ b/packages/poml/tests/reader/cst.test.ts
@@ -0,0 +1,918 @@
+import { describe, expect, test } from '@jest/globals';
+import { CstNode, IToken } from 'chevrotain';
+import { ExtendedPomlParser } from 'poml/next/cst';
+import { extendedPomlLexer, Whitespace, Identifier } from 'poml/next/lexer';
+import {
+ CstRootNode,
+ CstElementContentNode,
+ CstTemplateNode,
+ CstCommentNode,
+ CstPragmaNode,
+ CstQuotedNode,
+ CstQuotedTemplateNode,
+ CstForIteratorNode,
+ CstAttributeNode,
+ CstOpenTagPartialNode,
+ CstCloseTagNode,
+ CstElementNode,
+ CstTokens,
+} from 'poml/next/nodes';
+
+function withParser<T>(input: string, run: (p: ExtendedPomlParser) => T, raiseOnError?: boolean) {
+ const lex = extendedPomlLexer.tokenize(input);
+ const parser = new ExtendedPomlParser();
+ parser.input = lex.tokens;
+ const node = run(parser);
+ if (raiseOnError || raiseOnError === undefined) {
+ expect(parser.errors).toHaveLength(0);
+ }
+ return { node, parser, tokens: lex.tokens, errors: parser.errors };
+}
+
+describe('CST Parser Rules', () => {
+ test('template rule produces CstTemplateNode', () => {
+ const { node } = withParser('{{ name }}', (p) => p.template()) as { node: CstTemplateNode };
+
+ expect(node.name).toBe('template');
+ expect(node.children.TemplateOpen?.[0].image).toBe('{{');
+ expect(node.children.Content).toBeDefined();
+ // Should have whitespace after open and before close when present
+ expect(node.children.WsAfterOpen?.[0].tokenType).toBe(Whitespace);
+ // nodes.ts expects WsAfterContent before close
+ expect(node.children.WsAfterContent?.[0].tokenType).toBe(Whitespace);
+ expect(node.children.TemplateClose?.[0].image).toBe('}}');
+ });
+
+ test('comment rule produces CstCommentNode', () => {
+ const { node } = withParser('<!-- hello -->', (p) => p.comment()) as { node: CstCommentNode };
+ expect(node.name).toBe('comment');
+ expect(node.children.CommentOpen?.[0].image).toBe('<!--');
+ expect(node.children.CommentClose?.[0].image).toBe('-->');
+ });
+
+ test('pragma rule produces CstPragmaNode', () => {
+ const input = '<!-- @pragma components +reference -table -->';
+ const { node } = withParser(input, (p) => p.pragma()) as { node: CstPragmaNode };
+
+ expect(node.name).toBe('pragma');
+ expect(node.children.CommentOpen?.[0].image).toBe('<!--');
+ expect(node.children.CommentClose?.[0].image).toBe('-->');
+ });
+
+ test('quoted rule produces CstQuotedNode (double and single)', () => {
+ const { node: node1 } = withParser('"hello"', (p) => p.quoted()) as { node: CstQuotedNode };
+ expect(node1.name).toBe('quoted');
+ expect(node1.children.OpenQuote?.[0].image).toBe('"');
+ // Content is a CstDoubleQuotedTokens node
+ const contentNode = node1.children.Content?.[0];
+ const contentText = contentNode?.children.Content?.map((t) => t.image).join('') || '';
+ expect(contentText).toBe('hello');
+ expect(node1.children.CloseQuote?.[0].image).toBe('"');
+
+ const { node: node2 } = withParser("'world'", (p) => p.quoted()) as { node: CstQuotedNode };
+ expect(node2.children.OpenQuote?.[0].image).toBe("'");
+ expect(node2.children.CloseQuote?.[0].image).toBe("'");
+ });
+
+ test('quotedTemplate rule produces CstQuotedTemplateNode', () => {
+ const input = '"Hello {{ name }}!"';
+ const { node } = withParser(input, (p) => p.quotedTemplate()) as { node: CstQuotedTemplateNode };
+ expect(node.name).toBe('quotedTemplate');
+ expect(node.children.OpenQuote?.[0].image).toBe('"');
+ expect(node.children.CloseQuote?.[0].image).toBe('"');
+ expect(node.children.Content?.length).toBeGreaterThan(0);
+ // Should include a template embedded inside
+ const hasTemplate = (node.children.Content || []).some(
+ (c: any) => typeof c === 'object' && c && 'name' in c && (c as any).name === 'template',
+ );
+ expect(hasTemplate).toBe(true);
+ });
+
+ test('forIteratorValue rule produces CstForIteratorNode', () => {
+ const input = '"item in items"';
+ const { node } = withParser(input, (p) => p.forIteratorValue()) as { node: CstForIteratorNode };
+ expect(node.name).toBe('forIteratorValue');
+ expect(node.children.OpenQuote?.[0].image).toBe('"');
+ expect(node.children.Iterator?.[0].image).toBe('item');
+ expect(node.children.InKeyword?.[0].tokenType).toBe(Identifier);
+ // nodes.ts expects Collection label for the expression part
+ expect(node.children.Collection?.length).toBeGreaterThan(0);
+ expect(node.children.CloseQuote?.[0].image).toBe('"');
+ });
+
+ test('attribute rule produces CstAttributeNode for plain, templated, and for-iterator values', () => {
+ // quoted value
+ let result = withParser('id="value"', (p) => p.attribute()) as { node: CstAttributeNode };
+ let node = result.node;
+ expect(node.name).toBe('attribute');
+ expect(node.children.AttributeKey?.[0].image).toBe('id');
+ expect(node.children.Equals?.[0].image).toBe('=');
+ expect(node.children.quotedValue?.[0]).toBeDefined();
+
+ // templated value
+ result = withParser('title={{ name }}', (p) => p.attribute()) as { node: CstAttributeNode };
+ node = result.node;
+ expect(node.children.AttributeKey?.[0].image).toBe('title');
+ expect(node.children.templatedValue?.[0]).toBeDefined();
+
+ // for-iterator value
+ result = withParser('for="i in items"', (p) => p.attribute()) as { node: CstAttributeNode };
+ node = result.node;
+ expect(node.children.AttributeKey?.[0].image.toLowerCase()).toBe('for');
+ expect(node.children.forIteratorValue?.[0]).toBeDefined();
+ });
+
+ test('openTagPartial rule returns extra fields and children', () => {
+ const { node } = withParser('<text id="a"', (p) => p.openTagPartial()) as {
+ node: CstOpenTagPartialNode;
+ };
+ expect(node.name).toBe('openTagPartial');
+ // Children
+ expect(node.children.OpenBracket?.[0].image).toBe('<');
+ expect(node.children.TagName?.[0].image.toLowerCase()).toBe('text');
+ expect(node.children.Attribute?.length).toBeGreaterThan(0);
+ });
+
+ test('closeTag rule produces CstCloseTagNode', () => {
+ const { node } = withParser('</text>', (p) => p.closeTag()) as { node: CstCloseTagNode };
+ expect(node.name).toBe('closeTag');
+ expect(node.children.ClosingOpenBracket?.[0].image).toBe('</');
+ expect(node.children.TagName?.[0].image.toLowerCase()).toBe('text');
+ expect(node.children.CloseBracket?.[0].image).toBe('>');
+ });
+
+ test('element rule: normal open/close element produces CstElementNode', () => {
+ const input = '<task>{{x}}</task>';
+ const { node } = withParser(input, (p) => p.element()) as { node: CstElementNode };
+ expect(node.name).toBe('element');
+ expect(node.children.OpenTagPartial?.[0]).toBeDefined();
+ expect(node.children.OpenTagCloseBracket?.[0].image).toBe('>');
+ expect(node.children.Content?.length).toBe(1);
+ const contentNode = node.children.Content?.[0] as CstElementContentNode;
+ expect(contentNode.name).toBe('elementContent');
+ const templateNode = contentNode.children.Template?.[0] as CstTemplateNode;
+ expect(templateNode.name).toBe('template');
+ expect(templateNode.children.TemplateOpen?.[0].image).toBe('{{');
+ expect(templateNode.children.Content?.[0].children.Content?.[0].image).toBe('x');
+ expect(templateNode.children.TemplateClose?.[0].image).toBe('}}');
+ expect(node.children.CloseTag?.[0]).toBeDefined();
+ });
+
+ test('element rule: self-closing element', () => {
+ const { node } = withParser('<meta />', (p) => p.element()) as { node: CstElementNode };
+ expect(node.children.OpenTagPartial?.[0]).toBeDefined();
+ const openTag = node.children.OpenTagPartial?.[0] as CstOpenTagPartialNode;
+ expect(openTag.children.OpenBracket?.[0].image).toBe('<');
+ expect(openTag.children.TagName?.[0].image).toBe('meta');
+ expect(openTag.children.WsAfterAll?.[0].image).toBe(' ');
+ expect(node.children.SelfCloseBracket?.[0].image).toBe('/>');
+ });
+
+ test('element rule: literal element treats content as TextContent', () => {
+ const input = '<text>Hello {{ name }}<text> </text>';
+ const { node } = withParser(input, (p) => p.element()) as { node: CstElementNode };
+ expect(node.children.OpenTagPartial?.[0]).toBeDefined();
+ expect(node.children.OpenTagCloseBracket?.[0].image).toBe('>');
+ // Literal elements should store raw tokens under TextContent (no Template child)
+ expect(node.children.TextContent?.length).toBeGreaterThan(0);
+ const content = node.children.TextContent?.[0] as CstTokens;
+ const images = content.children.Content?.map((t) => t.image) || [];
+ expect(images).toContain('{{');
+ expect(images).toContain('}}');
+ expect(images).toContain('<');
+ expect(images).toContain('text');
+ expect(images).toContain('>');
+ expect(images[images.length - 1]).toBe(' ');
+ expect(node.children.CloseTag?.[0]).toBeDefined();
+ const closeTag = node.children.CloseTag?.[0] as CstCloseTagNode;
+ expect(closeTag.children.TagName?.[0].image).toBe('text');
+ });
+
+ test('elementContent rule produces CstElementContentNode with text', () => {
+ const { node } = withParser('hello world', (p) => p.elementContent()) as {
+ node: CstElementContentNode;
+ };
+ expect(node.name).toBe('elementContent');
+ expect(node.children.TextContent?.length).toBeGreaterThan(0);
+ });
+
+ test('root rule produces CstRootNode with Content', () => {
+ const input = 't{{x}}';
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+ expect(node.name).toBe('root');
+ expect(node.children.Content?.length).toBeGreaterThan(0);
+
+ // Sanity: ensure CST contains an element somewhere
+ const contentNodes = node.children.Content || [];
+ const elementNames = contentNodes.map((n) => (n as any).name);
+ expect(elementNames).toContain('elementContent');
+
+ expect(node.location).toEqual({
+ startOffset: 0,
+ startLine: 1,
+ startColumn: 1,
+ endOffset: 70,
+ endLine: 1,
+ endColumn: 71,
+ });
+ });
+});
+
+describe('Special Tokens', () => {
+ test('root document with no root tags', () => {
+ const input = `Hello {{ user }}!
+ Some text arbi&rary; symbols\\etc/>
+
+done`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+ expect(images(node)).toStrictEqual([
+ { TextContent: 'Hello ' },
+ {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: 'user',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ { TextContent: '!\n' },
+ {
+ Comment: {
+ CommentOpen: '<!--',
+ },
+ },
+ { TextContent: ' ' },
+ {
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'text' },
+ OpenTagCloseBracket: '>',
+ TextContent: 'Some text arbi&rary; symbols\\etc/>',
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'text', CloseBracket: '>' },
+ },
+ },
+ { TextContent: '\n\ndone' },
+ ]);
+
+ expect(names(node)).toStrictEqual({
+ name: 'root',
+ children: [
+ {
+ name: 'elementContent',
+ children: { TextContent: { name: 'betweenTagsTokens' } },
+ },
+ {
+ name: 'elementContent',
+ children: {
+ Template: { name: 'template', children: { name: 'expressionTokens' } },
+ },
+ },
+ {
+ name: 'elementContent',
+ children: { TextContent: { name: 'betweenTagsTokens' } },
+ },
+ {
+ name: 'elementContent',
+ children: {
+ Comment: { name: 'comment', children: { name: 'commentTokens' } },
+ },
+ },
+ {
+ name: 'elementContent',
+ children: { TextContent: { name: 'betweenTagsTokens' } },
+ },
+ {
+ name: 'elementContent',
+ children: {
+ Element: {
+ name: 'element',
+ children: {
+ OpenTagPartial: { name: 'openTagPartial' },
+ TextContent: { name: 'literalTagTokens' },
+ CloseTag: { name: 'closeTag' },
+ },
+ },
+ },
+ },
+ {
+ name: 'elementContent',
+ children: { TextContent: { name: 'betweenTagsTokens' } },
+ },
+ ],
+ });
+
+ expect(locations(node)).toStrictEqual({
+ start: 0,
+ end: 92,
+ children: [
+ {
+ start: 0,
+ end: 5,
+ children: { TextContent: { start: 0, end: 5 } },
+ },
+ {
+ start: 6,
+ end: 15,
+ children: {
+ Template: { start: 6, end: 15, children: { start: 9, end: 12 } },
+ },
+ },
+ {
+ start: 16,
+ end: 17,
+ children: { TextContent: { start: 16, end: 17 } },
+ },
+ {
+ start: 18,
+ end: 35,
+ children: {
+ Comment: { start: 18, end: 35, children: { start: 22, end: 32 } },
+ },
+ },
+ {
+ start: 36,
+ end: 37,
+ children: { TextContent: { start: 36, end: 37 } },
+ },
+ {
+ start: 38,
+ end: 86,
+ children: {
+ Element: {
+ start: 38,
+ end: 86,
+ children: {
+ OpenTagPartial: { start: 38, end: 42 },
+ TextContent: { start: 44, end: 79 },
+ CloseTag: { start: 80, end: 86 },
+ },
+ },
+ },
+ },
+ {
+ start: 87,
+ end: 92,
+ children: { TextContent: { start: 87, end: 92 } },
+ },
+ ],
+ });
+ });
+
+ test('all kinds of whitespaces', () => {
+ const input = `\t\n\r <\tdocument\t >\n\t {{ name }}\r\n\t \t\n`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual([
+ { TextContent: '\t\n\r ' },
+ {
+ Element: {
+ OpenTagPartial: {
+ OpenBracket: '<',
+ WsAfterOpen: '\t',
+ TagName: 'document',
+ WsAfterAll: '\t ',
+ },
+ OpenTagCloseBracket: '>',
+ Content: [
+ { TextContent: '\n\t ' },
+ {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: ' name',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ { TextContent: '\r\n\t' },
+ ],
+ CloseTag: {
+ ClosingOpenBracket: '</',
+ TagName: 'document',
+ CloseBracket: '>',
+ },
+ },
+ },
+ { TextContent: ' \t\n' },
+ ]);
+ });
+
+ test('single quotes vs double quotes edge cases', () => {
+ const input = `< div id='single' class="double" > {{ 'nested "quote"' }} </ div >`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: {
+ OpenBracket: '<',
+ TagName: 'div',
+ Attribute: [
+ { AttributeKey: 'id', Equals: '=', quotedValue: { OpenQuote: "'", Content: 'single', CloseQuote: "'" } },
+ { AttributeKey: 'class', Equals: '=', quotedValue: { OpenQuote: '"', Content: 'double', CloseQuote: '"' } },
+ ],
+ WsBeforeEachAttribute: ' ',
+ WsAfterOpen: ' ',
+ WsAfterAll: ' ',
+ },
+ OpenTagCloseBracket: '>',
+ Content: [
+ { TextContent: ' ' },
+ {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: '\'nested "quote"\'',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ { TextContent: ' ' },
+ ],
+ CloseTag: {
+ ClosingOpenBracket: '</',
+ TagName: 'div',
+ CloseBracket: '>',
+ WsAfterOpen: ' ',
+ WsBeforeClose: ' ',
+ },
+ },
+ });
+ });
+
+ test('empty quotes edge cases', () => {
+ const input = `<tag attr1="" attr2=''></tag>`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: {
+ OpenBracket: '<',
+ TagName: 'tag',
+ Attribute: [
+ { AttributeKey: 'attr1', Equals: '=', quotedValue: { OpenQuote: '"', CloseQuote: '"' } },
+ { AttributeKey: 'attr2', Equals: '=', quotedValue: { OpenQuote: "'", CloseQuote: "'" } },
+ ],
+ WsBeforeEachAttribute: ' ',
+ },
+ OpenTagCloseBracket: '>',
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'tag', CloseBracket: '>' },
+ },
+ });
+ });
+
+ test('matched text element with literal content', () => {
+ const input = `<text>Hello {{ world }} and nested</text>`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'text' },
+ OpenTagCloseBracket: '>',
+ TextContent: 'Hello {{ world }} and nested',
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'text', CloseBracket: '>' },
+ },
+ });
+ });
+
+ test('mismatched tags - text opening with template closing', () => {
+ const input = `Content here`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ CloseTag: { CloseBracket: '>', ClosingOpenBracket: '</', TagName: 'text' },
+ OpenTagCloseBracket: '>',
+ OpenTagPartial: { OpenBracket: '<', TagName: 'text' },
+ TextContent: 'Content here',
+ },
+ });
+ });
+
+ test('completely unmatched tags should not error', () => {
+ const input = `<document>content</div><span>more</p>`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual([
+ {
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'document' },
+ OpenTagCloseBracket: '>',
+ Content: { TextContent: 'content' },
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'div', CloseBracket: '>' },
+ },
+ },
+ {
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'span' },
+ OpenTagCloseBracket: '>',
+ Content: { TextContent: 'more' },
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'p', CloseBracket: '>' },
+ },
+ },
+ ]);
+ });
+
+ test('nested quoted templates with mixed quotes', () => {
+ const input = `<div title="Hello {{ 'user' }}" meta = '{if{{nothing'' }}123'>'World'</div>`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: {
+ OpenBracket: '<',
+ TagName: 'div',
+ WsBeforeEachAttribute: ' ',
+ Attribute: [
+ {
+ AttributeKey: 'title',
+ Equals: '=',
+ quotedValue: {
+ OpenQuote: '"',
+ Content: [
+ 'Hello ',
+ {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: "'user'",
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ ],
+ CloseQuote: '"',
+ },
+ },
+ {
+ AttributeKey: 'meta',
+ Equals: '=',
+ WsAfterEquals: ' ',
+ WsAfterKey: ' ',
+ quotedValue: {
+ CloseQuote: "'",
+ Content: [
+ '{if',
+ {
+ Content: "nothing''",
+ TemplateClose: '}}',
+ TemplateOpen: '{{',
+ WsAfterContent: ' ',
+ },
+ '123',
+ ],
+ OpenQuote: "'",
+ },
+ },
+ ],
+ },
+ OpenTagCloseBracket: '>',
+ Content: {
+ TextContent: "'World'",
+ },
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'div', CloseBracket: '>' },
+ },
+ });
+ });
+
+ test('special characters and symbols in content', () => {
+ const input = `<text>@#$%^&*(){}[]|\\:";'<>?/.,~\`</text>`;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'text' },
+ OpenTagCloseBracket: '>',
+ TextContent: '@#$%^&*(){}[]|\\:";\'<>?/.,~`',
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'text', CloseBracket: '>' },
+ },
+ });
+ });
+
+ test('multiple templates and elements mixed with whitespace', () => {
+ const input = ` {{ a }} <div>{{ b }}</div> {{ c }} `;
+ const { node } = withParser(input, (p) => p.root()) as { node: CstRootNode };
+
+ expect(images(node)).toStrictEqual([
+ { TextContent: ' ' },
+ {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: 'a',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ { TextContent: ' ' },
+ {
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'div' },
+ OpenTagCloseBracket: '>',
+ Content: {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: 'b',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'div', CloseBracket: '>' },
+ },
+ },
+ { TextContent: ' ' },
+ {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: 'c',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ { TextContent: ' ' },
+ ]);
+ });
+});
+
+describe('Error', () => {
+ test('orphan closing tags should error', () => {
+ const input = `Some text</orphan>{{ template }}</unknown>`;
+ const { node, errors } = withParser(input, (p) => p.root(), false) as { node: CstRootNode; errors: any[] };
+ expect(errors.length).toBe(4);
+
+ expect(images(node)).toStrictEqual([
+ { TextContent: 'Some text' },
+ { TextContent: 'orphan' },
+ {
+ Template: {
+ TemplateOpen: '{{',
+ WsAfterOpen: ' ',
+ Content: 'template',
+ WsAfterContent: ' ',
+ TemplateClose: '}}',
+ },
+ },
+ { TextContent: 'unknown' },
+ ]);
+ });
+
+ test('mismatched tags - template opening with text closing', () => {
+ const input = `<template>Some content</text>`;
+ const { node, errors } = withParser(input, (p) => p.root(), false) as { node: CstRootNode; errors: any[] };
+ expect(errors.length).toBe(1);
+
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: { OpenBracket: '<', TagName: 'template' },
+ OpenTagCloseBracket: '>',
+ TextContent: 'Some content',
+ },
+ });
+
+ expect(names(node)).toStrictEqual({
+ name: 'root',
+ children: {
+ name: 'elementContent',
+ children: {
+ Element: {
+ name: 'element',
+ children: {
+ OpenTagPartial: { name: 'openTagPartial' },
+ TextContent: { name: 'literalTagTokens' },
+ CloseTag: { name: 'closeTag' },
+ },
+ },
+ },
+ },
+ });
+ });
+
+ test('empty template', () => {
+ const input = `<any foo={{}}>{{ }}</any>`;
+ const { node, errors } = withParser(input, (p) => p.root(), false) as { node: CstRootNode; errors: any[] };
+ expect(errors.length).toBe(2);
+ expect(images(node)).toStrictEqual({
+ Element: {
+ OpenTagPartial: {
+ OpenBracket: '<',
+ TagName: 'any',
+ WsBeforeEachAttribute: ' ',
+ Attribute: {
+ AttributeKey: 'foo',
+ Equals: '=',
+ templatedValue: { TemplateOpen: '{{', TemplateClose: '}}' },
+ },
+ },
+ OpenTagCloseBracket: '>',
+ Content: {
+ Template: { TemplateOpen: '{{', WsAfterOpen: ' ', TemplateClose: '}}' },
+ },
+ CloseTag: { ClosingOpenBracket: '</', TagName: 'any', CloseBracket: '>' },
+ },
+ });
+ });
+});
+
+/* -------------------- tiny guards -------------------- */
+const isToken = (x: unknown): x is IToken => !!x && typeof (x as IToken).image === 'string';
+
+const isCstNode = (x: unknown): x is CstNode =>
+ !!x && typeof (x as any).name === 'string' && typeof (x as any).children === 'object';
+
+/* -------------------- core normalize -------------------- */
+/**
+ * Rules:
+ * - drop undefined
+ * - arrays: [] -> undefined; [x] -> x; [strings...] -> joined string; otherwise keep (with inner normalize)
+ * - objects: normalize recursively; if only key is "Content" -> unwrap value
+ */
+function normalizeAny(v: unknown): unknown {
+ if (v == null) {
+ return undefined;
+ }
+ if (Array.isArray(v)) {
+ return normalizeArray(v);
+ }
+ if (isToken(v) || isCstNode(v)) {
+ return v;
+ }
+ if (typeof v === 'object') {
+ return normalizeObject(v as Record<string, unknown>);
+ }
+ return v;
+}
+
+function normalizeArray(arr: unknown[]): unknown {
+ const mapped = arr.map(normalizeAny).filter((v) => v !== undefined);
+
+ if (mapped.length === 0) {
+ return undefined;
+ }
+ if (mapped.every((x) => typeof x === 'string')) {
+ // concatenate pure string arrays
+ return (mapped as string[]).join('');
+ }
+ if (mapped.length === 1) {
+ return mapped[0];
+ }
+ return mapped;
+}
+
+function normalizeObject(obj: Record<string, unknown>): unknown {
+ const out: Record<string, unknown> = {};
+ for (const [k, v] of Object.entries(obj)) {
+ const nv = normalizeAny(v);
+ if (nv !== undefined) {
+ out[k] = nv;
+ }
+ }
+ const keys = Object.keys(out);
+ if (keys.length === 0) {
+ return undefined;
+ }
+ if (keys.length === 1 && keys[0] === 'Content') {
+ return out.Content;
+ }
+ return out;
+}
+
+function normalizeChildren(node: CstNode): unknown {
+ return normalizeObject(node.children as Record<string, unknown>);
+}
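+
+// Quick illustration of the rules above (inputs are made-up plain objects, not CST nodes):
+//   normalizeAny({ Content: ['a', undefined, 'b'] })   -> 'ab'
+//   normalizeAny({ A: [], B: [{ Content: ['x'] }] })   -> { B: 'x' }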
+
+/* -------------------- generic transformer -------------------- */
+type Mode = 'images' | 'names' | 'locations';
+
+type Strategies = {
+ onToken(v: IToken): unknown; // what to emit for a token
+ onNodeWrap(n: CstNode, children: unknown): unknown; // how to wrap a CST node around its transformed children
+ keepChildKey(k: string, v: unknown): boolean; // allow pruning of token-only branches
+};
+
+function transformValue(val: unknown, S: Strategies): unknown {
+ if (val == null) {
+ return undefined;
+ }
+
+ if (isToken(val)) {
+ return S.onToken(val);
+ }
+
+ if (Array.isArray(val)) {
+ const mapped = val.map((x) => transformValue(x, S)).filter((x) => x !== undefined);
+ if (mapped.length === 0) {
+ return undefined;
+ }
+ if (mapped.every((x) => typeof x === 'string')) {
+ return (mapped as string[]).join('');
+ }
+ if (mapped.length === 1) {
+ return mapped[0];
+ }
+ return mapped;
+ }
+
+ if (isCstNode(val)) {
+ const norm = normalizeChildren(val);
+ const inner = transformValue(norm, S);
+ return S.onNodeWrap(val, inner);
+ }
+
+ if (typeof val === 'object') {
+ const out: Record<string, unknown> = {};
+ for (const [k, v] of Object.entries(val)) {
+ const mv = transformValue(v, S);
+ if (mv !== undefined && S.keepChildKey(k, mv)) {
+ out[k] = mv;
+ }
+ }
+ const keys = Object.keys(out);
+ if (keys.length === 0) {
+ return undefined;
+ }
+ if (keys.length === 1 && keys[0] === 'Content') {
+ return out.Content;
+ }
+ return out;
+ }
+
+ // primitive fallback: pass through (lets string concatenation work if present)
+ return val;
+}
+
+/* -------------------- concrete modes -------------------- */
+
+// images(): leaves become strings; nested objects keyed by child names.
+// Token arrays get concatenated (via normalize/transform).
+export function images(node: CstNode): unknown {
+ const S: Strategies = {
+ onToken: (t) => t.image, // keep token text
+ onNodeWrap: (_n, children) => children, // node name not included; just the nested children map
+ keepChildKey: (_k, _v) => true, // keep everything
+ };
+ return transformValue(normalizeChildren(node), S);
+}
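+
+// e.g. in the root test above, '{{ user }}' shows up in images() output as
+// { Template: { TemplateOpen: '{{', WsAfterOpen: ' ', Content: 'user', WsAfterContent: ' ', TemplateClose: '}}' } }.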
+
+// names(): only node names; omit token leaves entirely, but KEEP the full node tree.
+// If children collapse to an array or primitive, keep them as-is so we don't lose the branch.
+export function names(node: CstNode): { name: string; children?: Record<string, unknown> } {
+ const S: Strategies = {
+ onToken: (_t) => undefined, // drop token leaves
+ onNodeWrap: (n, children) => {
+ const out: { name: string; children?: Record<string, unknown> | unknown[] } = { name: n.name };
+ if (children !== undefined) {
+ if (typeof children === 'object' && !Array.isArray(children)) {
+ // plain object: use as-is
+ const keys = Object.keys(children as Record<string, unknown>);
+ if (keys.length) {
+ out.children = children as Record<string, unknown>;
+ }
+ } else {
+ // array or primitive: assign directly so the branch is not lost
+ out.children = children as unknown[];
+ }
+ }
+ return out;
+ },
+ keepChildKey: (_k, v) => v !== undefined,
+ };
+ return transformValue(node, S) as any;
+}
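+
+// e.g. the same '{{ user }}' appears in names() output as
+// { Template: { name: 'template', children: { name: 'expressionTokens' } } }.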
+
+// locations(): node-level { start,end } only; omit token-level ranges.
+// Same "wrap under Content if not a plain object" rule to preserve shape.
+export function locations(node: CstNode): { start: number; end: number; children?: Record<string, unknown> } {
+ const S: Strategies = {
+ onToken: (_t) => undefined, // drop token ranges
+ onNodeWrap: (n, children) => {
+ const base: { start: number; end: number; children?: Record<string, unknown> | unknown[] } = {
+ start: n.location?.startOffset ?? 0,
+ end: n.location?.endOffset ?? 0,
+ };
+ if (children !== undefined) {
+ if (typeof children === 'object' && !Array.isArray(children)) {
+ const keys = Object.keys(children as Record<string, unknown>);
+ if (keys.length) {
+ base.children = children as Record<string, unknown>;
+ }
+ } else {
+ base.children = children as unknown[];
+ }
+ }
+ return base;
+ },
+ keepChildKey: (_k, v) => v !== undefined,
+ };
+ return transformValue(node, S) as any;
+}
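+
+// e.g. the '{{ user }}' template at offsets 6-15 in the root test above reduces under locations() to
+// { Template: { start: 6, end: 15, children: { start: 9, end: 12 } } }.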
diff --git a/packages/poml/tests/reader/lexer.test.ts b/packages/poml/tests/reader/lexer.test.ts
new file mode 100644
index 00000000..49d98d2c
--- /dev/null
+++ b/packages/poml/tests/reader/lexer.test.ts
@@ -0,0 +1,1323 @@
+import { describe, expect, test } from '@jest/globals';
+import {
+ extendedPomlLexer,
+ CommentOpen,
+ CommentClose,
+ TemplateOpen,
+ TemplateClose,
+ OpenBracket,
+ ClosingOpenBracket,
+ SelfCloseBracket,
+ CloseBracket,
+ Equals,
+ DoubleQuote,
+ SingleQuote,
+ Backslash,
+ Identifier,
+ Whitespace,
+ Arbitrary,
+ BackslashEscape,
+ CharacterEntity,
+ PragmaKeyword,
+} from 'poml/next/lexer';
+
+// Helper function to extract token images
+function tokenImages(input: string): string[] {
+ const result = extendedPomlLexer.tokenize(input);
+ return result.tokens.map((t) => t.image);
+}
+
+// Helper function to extract token types
+function tokenTypes(input: string): any[] {
+ const result = extendedPomlLexer.tokenize(input);
+ return result.tokens.map((t) => t.tokenType);
+}
+
+// Helper function to get full tokenization result
+function tokenize(input: string) {
+ return extendedPomlLexer.tokenize(input);
+}
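+
+// tokenize() returns chevrotain's lexing result ({ tokens, errors, groups }); the helpers above
+// project it onto token images and token types, e.g. tokenImages('<a>') -> ['<', 'a', '>'].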
+
+describe('Basic Token Images', () => {
+ test('should tokenize HTML comments', () => {
+ expect(tokenImages('<!-- comment -->')).toEqual(['<!--', ' ', 'comment', ' ', '-->']);
+ });
+
+ test('should tokenize template variables', () => {
+ expect(tokenImages('{{variable}}')).toEqual(['{{', 'variable', '}}']);
+ });
+
+ test('should tokenize XML tags', () => {
+ expect(tokenImages('<task>')).toEqual(['<', 'task', '>']);
+ expect(tokenImages('</task>')).toEqual(['</', 'task', '>']);
+ expect(tokenImages('<meta />')).toEqual(['<', 'meta', ' ', '/>']);
+ });
+
+ test('should tokenize quotes and backslashes individually', () => {
+ expect(tokenImages('"hello"')).toEqual(['"', 'hello', '"']);
+ expect(tokenImages("'world'")).toEqual(["'", 'world', "'"]);
+ expect(tokenImages('text\\escape')).toEqual(['text', '\\', 'escape']);
+ });
+
+ test('should tokenize attributes', () => {
+ expect(tokenImages('id="value"')).toEqual(['id', '=', '"', 'value', '"']);
+ });
+
+ test('should tokenize whitespace', () => {
+ expect(tokenImages(' \t\n ')).toEqual([' \t\n ']);
+ });
+
+ test('should tokenize identifiers', () => {
+ expect(tokenImages('simple-name_123')).toEqual(['simple-name_123']);
+ });
+
+ test('should tokenize text content', () => {
+ expect(tokenImages('plain text here')).toEqual(['plain', ' ', 'text', ' ', 'here']);
+ });
+});
+
+describe('Edge Cases', () => {
+ test('should handle "abcdefghi"', () => {
+ expect(tokenImages('"abcdefghi"')).toEqual([
+ '"',
+ 'abc',
+ '<',
+ 'poml',
+ '>',
+ 'def',
+ '</',
+ 'poml',
+ '>',
+ 'ghi',
+ '"',
+ ]);
+ });
+
+ test('should handle <poml abc="def">ghi</poml>', () => {
+ expect(tokenImages('<poml abc="def">ghi</poml>')).toEqual([
+ '<',
+ 'poml',
+ ' ',
+ 'abc',
+ '=',
+ '"',
+ 'def',
+ '"',
+ '>',
+ 'ghi',
+ '</',
+ 'poml',
+ '>',
+ ]);
+ });
+
+ test('should handle mixed content', () => {
+ expect(tokenImages('text {{var}} more')).toEqual(['text', ' ', '{{', 'var', '}}', ' ', 'more']);
+ });
+
+ test('chinese characters', () => {
+ expect(tokenImages('中文 {{ 文本 }}内容< 标签>')).toEqual([
+ '中文',
+ ' ',
+ '{{',
+ ' ',
+ '文本',
+ ' ',
+ '}}',
+ '内容',
+ '<',
+ ' ',
+ '标签',
+ '>',
+ ]);
+ });
+
+ test('should handle complex attributes', () => {
+ expect(tokenImages('<task id="{{value}}" class="test">')).toEqual([
+ '<',
+ 'task',
+ ' ',
+ 'id',
+ '=',
+ '"',
+ '{{',
+ 'value',
+ '}}',
+ '"',
+ ' ',
+ 'class',
+ '=',
+ '"',
+ 'test',
+ '"',
+ '>',
+ ]);
+ });
+
+ test('should handle escaped quotes', () => {
+ expect(tokenImages('text "with \\"escaped\\" quotes"')).toEqual([
+ 'text',
+ ' ',
+ '"',
+ 'with',
+ ' ',
+ '\\"',
+ 'escaped',
+ '\\"',
+ ' ',
+ 'quotes',
+ '"',
+ ]);
+ });
+
+ test('should handle complex real-world scenarios', () => {
+ const realWorldTests = [
+ `
+
+
+ {{page.title}}
+
+
+
+
+ {{content}}
+
+
+`,
+
+ `
+ {{description}}
+
+`,
+
+ `"Complex string with {{variables}} and inside"`,
+
+ `{{#each items}}
+
+ {{name}}
+
+{{/each}}`,
+ ];
+
+ realWorldTests.forEach((test) => {
+ const result = tokenize(test);
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens.length).toBeGreaterThan(0);
+
+ // Verify position integrity
+ result.tokens.forEach((token) => {
+ expect(token.startOffset).toBeGreaterThanOrEqual(0);
+ expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!);
+ });
+ });
+ });
+
+ test('should handle equals sign in various contexts', () => {
+ const equalsTests = [
+ 'attr=value',
+ 'attr="value"',
+ "attr='value'",
+ 'attr={{value}}',
+ 'first=one second=two',
+ '=standalone',
+ 'text=content',
+ 'a=b=c',
+ ];
+
+ equalsTests.forEach((test) => {
+ const result = tokenize(test);
+ expect(result.errors).toHaveLength(0);
+
+ const equalsTokens = result.tokens.filter((t) => t.tokenType.name === 'Equals');
+ expect(equalsTokens.length).toBeGreaterThan(0);
+ });
+ });
+
+ test('should handle edge cases with zero-length matches', () => {
+ const edgeCases = ['', ' ', '\n', '\t', '\r', '{{}}', '', '<>', '""', "''", '\\'];
+
+ edgeCases.forEach((test) => {
+ const result = tokenize(test);
+ expect(result.errors).toHaveLength(0);
+
+ if (test === '') {
+ expect(result.tokens).toHaveLength(0);
+ } else {
+ expect(result.tokens.length).toBeGreaterThan(0);
+ }
+ });
+ });
+
+ // Added by claude
+ test('should handle comment-like sequences in different contexts', () => {
+ // Dashes inside a comment should not end it early unless they form a close sequence
+ expect(tokenImages('<!-- a -- b -->')).toEqual(['<!--', ' ', 'a', ' ', '--', ' ', 'b', ' ', '-->']); // Dashes in content
+
+ // Edge case: comment opener followed immediately by closer
+ expect(tokenImages('<!---->')).toEqual(['<!--', '-->']);
+ expect(tokenImages('<!------>')).toEqual(['<!--', '---->']); // Four dashes then close
+ });
+
+ test('should handle backslash escapes at token boundaries correctly', () => {
+ // BackslashEscape pattern could conflict with regular Backslash
+ expect(tokenImages('\\n')).toEqual(['\\n']); // Valid escape
+ expect(tokenTypes('\\n')).toEqual([BackslashEscape]);
+
+ expect(tokenImages('\\q')).toEqual(['\\', 'q']); // Invalid escape
+ expect(tokenTypes('\\q')).toEqual([Backslash, Identifier]);
+
+ // Hex escapes at boundaries
+ expect(tokenImages('\\x4')).toEqual(['\\', 'x4']); // Incomplete hex (needs 2 digits)
+ expect(tokenImages('\\x4G')).toEqual(['\\', 'x4G']); // Invalid hex char
+ expect(tokenImages('\\xGG')).toEqual(['\\', 'xGG']); // No valid hex digits
+
+ // Unicode escapes with wrong digit count
+ expect(tokenImages('\\u123')).toEqual(['\\', 'u123']); // Too few digits (needs 4)
+ expect(tokenImages('\\u12345')).toEqual(['\\u1234', '5']); // Too many for \u
+ expect(tokenImages('\\U1234567')).toEqual(['\\', 'U1234567']); // Too few for \U (needs 8)
+ expect(tokenImages('\\U123456789')).toEqual(['\\U12345678', '9']); // Too many for \U
+
+ // Template brace escapes
+ expect(tokenImages('\\{{')).toEqual(['\\{{']); // Valid escape
+ expect(tokenImages('\\}}')).toEqual(['\\}}']); // Valid escape
+ expect(tokenImages('\\{')).toEqual(['\\', '{']); // Invalid - single brace
+ expect(tokenImages('\\}')).toEqual(['\\', '}']); // Invalid - single brace
+ });
+
+ test('should handle Arbitrary token lookahead constraints correctly', () => {
+ // The Arbitrary pattern has complex lookahead constraints for braces and slashes
+
+ // Single braces should be part of Arbitrary when not followed by same brace
+ expect(tokenImages('{a')).toEqual(['{a']);
+ expect(tokenTypes('{a')).toEqual([Arbitrary]);
+
+ expect(tokenImages('}b')).toEqual(['}b']);
+ expect(tokenTypes('}b')).toEqual([Arbitrary]);
+
+ // But double braces should be template markers
+ expect(tokenImages('{{a')).toEqual(['{{', 'a']);
+ expect(tokenTypes('{{a')).toEqual([TemplateOpen, Identifier]);
+
+ // Mixed scenarios
+ expect(tokenImages('a{b}c')).toEqual(['a', '{b}c']);
+ expect(tokenTypes('a{b}c')).toEqual([Identifier, Arbitrary]);
+
+ // Slash constraints
+ expect(tokenImages('a/b')).toEqual(['a', '/b']);
+ expect(tokenTypes('a/b')).toEqual([Identifier, Arbitrary]);
+
+ expect(tokenImages('a/>b')).toEqual(['a', '/>', 'b']);
+ expect(tokenTypes('a/>b')).toEqual([Identifier, SelfCloseBracket, Identifier]);
+
+ // Dash constraints (should not consume dashes that could start comment close)
+ expect(tokenImages('text--')).toEqual(['text--']);
+ expect(tokenImages('text---')).toEqual(['text---']);
+ expect(tokenImages('text-->')).toEqual(['text', '-->']);
+ expect(tokenImages('text--->')).toEqual(['text', '--->']);
+ });
+
+ test('should handle all character entity edge cases', () => {
+ // Valid entities
+ expect(tokenImages('&amp;')).toEqual(['&amp;']);
+ expect(tokenTypes('&amp;')).toEqual([CharacterEntity]);
+
+ expect(tokenImages('&#123;')).toEqual(['&#123;']);
+ expect(tokenTypes('&#123;')).toEqual([CharacterEntity]);
+
+ expect(tokenImages('&#xABCD;')).toEqual(['&#xABCD;']);
+ expect(tokenTypes('&#xABCD;')).toEqual([CharacterEntity]);
+
+ // Edge case: empty entity &;
+ expect(tokenImages('&;')).toEqual(['&;']);
+ expect(tokenTypes('&;')).toEqual([CharacterEntity]); // Pattern includes &;
+
+ // Invalid entities should NOT match
+ expect(tokenImages('&')).toEqual(['&']);
+ expect(tokenTypes('&')).toEqual([Arbitrary]);
+
+ expect(tokenImages('&abc')).toEqual(['&abc']); // Missing semicolon
+ expect(tokenTypes('&abc')).toEqual([Arbitrary]);
+
+ expect(tokenImages('')).toEqual(['']); // Incomplete numeric
+ expect(tokenTypes('')).toEqual([Arbitrary]);
+
+ expect(tokenImages('')).toEqual(['']); // Incomplete hex
+ expect(tokenTypes('')).toEqual([Arbitrary]);
+
+ // Entities in context
+ expect(tokenImages('a&b')).toEqual(['a', '&', 'b']);
+ expect(tokenImages('&<>')).toEqual(['&', '<', '>']);
+ });
+
+ // 5. Test for token precedence and order conflicts
+ test('should respect token precedence in ambiguous cases', () => {
+ // ClosingOpenBracket must come before OpenBracket
+ expect(tokenImages('</')).toEqual(['</']);
+ expect(tokenTypes('</')).toEqual([ClosingOpenBracket]);
+
+ expect(tokenImages('<')).toEqual(['<']);
+ expect(tokenTypes('<')).toEqual([OpenBracket]);
+
+ // SelfCloseBracket must come before CloseBracket
+ expect(tokenImages('/>')).toEqual(['/>']);
+ expect(tokenTypes('/>')).toEqual([SelfCloseBracket]);
+
+ expect(tokenImages('>')).toEqual(['>']);
+ expect(tokenTypes('>')).toEqual([CloseBracket]);
+
+ // BackslashEscape must come before Backslash
+ expect(tokenImages('\\n')).toEqual(['\\n']);
+ expect(tokenTypes('\\n')).toEqual([BackslashEscape]);
+
+ expect(tokenImages('\\z')).toEqual(['\\', 'z']);
+ expect(tokenTypes('\\z')).toEqual([Backslash, Identifier]);
+
+ // Identifier pattern with special chars
+ expect(tokenImages('a-b')).toEqual(['a-b']); // Dash allowed in identifier
+ expect(tokenImages('a--b')).toEqual(['a--b']); // Double dash allowed
+ expect(tokenImages('a---b')).toEqual(['a---b']); // Triple dash allowed
+ expect(tokenImages('a-->')).toEqual(['a', '-->']); // But not before >
+ expect(tokenImages('a--->')).toEqual(['a', '--->']); // Comment close takes precedence
+
+ // Identifier with dots and colons
+ expect(tokenImages('ns:tag.name')).toEqual(['ns:tag.name']);
+ expect(tokenTypes('ns:tag.name')).toEqual([Identifier]);
+
+ // PragmaKeyword tests
+ expect(tokenImages('@pragma')).toEqual(['@pragma']);
+ expect(tokenTypes('@pragma')).toEqual([PragmaKeyword]);
+ expect(tokenImages('-- @pragma')).toEqual(['--', ' ', '@pragma']);
+ expect(tokenTypes('-- @pragma')).toEqual([Arbitrary, Whitespace, PragmaKeyword]);
+ expect(tokenTypes('--@pragma')).toEqual([Arbitrary]);
+ expect(tokenTypes('<!-- pragma -->')).toEqual([CommentOpen, Whitespace, Identifier, Whitespace, CommentClose]);
+ });
+
+ test('should identify whitespace', () => {
+ expect(tokenTypes(' \t\n ')).toEqual([Whitespace]);
+ });
+
+ test('should identify attributes', () => {
+ expect(tokenTypes('')).toEqual([
+ OpenBracket,
+ Identifier,
+ Whitespace,
+ Identifier,
+ Equals,
+ DoubleQuote,
+ Identifier,
+ DoubleQuote,
+ Whitespace,
+ Identifier,
+ Equals,
+ DoubleQuote,
+ Arbitrary,
+ BackslashEscape,
+ DoubleQuote,
+ Arbitrary,
+ SingleQuote,
+ CloseBracket,
+ ]);
+ });
+
+ test('recognizes simple escapes', () => {
+ expect(tokenTypes('"a\\nb"')).toEqual([DoubleQuote, Identifier, BackslashEscape, Identifier, DoubleQuote]);
+
+ expect(tokenTypes("'a\\tb'")).toEqual([SingleQuote, Identifier, BackslashEscape, Identifier, SingleQuote]);
+
+ // Escaped quotes and backslash
+ expect(tokenTypes('"\\\" \\\\"')).toEqual([DoubleQuote, BackslashEscape, Whitespace, BackslashEscape, DoubleQuote]);
+ });
+
+ test('recognizes unicode and hex escapes', () => {
+ expect(tokenTypes('"A: \\x41"')).toEqual([
+ DoubleQuote,
+ Identifier, // A:
+ Whitespace,
+ BackslashEscape, // \x41
+ DoubleQuote,
+ ]);
+
+ expect(tokenTypes('"U: \\u0041"')).toEqual([
+ DoubleQuote,
+ Identifier, // U:
+ Whitespace,
+ BackslashEscape, // \u0041
+ DoubleQuote,
+ ]);
+
+ expect(tokenTypes('"emoji: \\U0001F600"')).toEqual([
+ DoubleQuote,
+ Identifier, // emoji:
+ Whitespace,
+ BackslashEscape, // \U0001F600
+ DoubleQuote,
+ ]);
+ });
+
+ test('recognizes escaped braces for templates', () => {
+ expect(tokenImages('pre \\{{ mid \\}} post')).toEqual(['pre', ' ', '\\{{', ' ', 'mid', ' ', '\\}}', ' ', 'post']);
+ expect(tokenTypes('pre \\{{ mid \\}} post')).toEqual([
+ Identifier,
+ Whitespace,
+ BackslashEscape,
+ Whitespace,
+ Identifier,
+ Whitespace,
+ BackslashEscape,
+ Whitespace,
+ Identifier,
+ ]);
+ });
+
+ test('invalid escapes fall back to Backslash + text', () => {
+ expect(tokenImages('"\\q"')).toEqual(['"', '\\', 'q', '"']);
+ expect(tokenTypes('"\\q"')).toEqual([DoubleQuote, Backslash, Identifier, DoubleQuote]);
+
+ // Incomplete hex/unicode
+ expect(tokenImages('"\\x4"')).toEqual(['"', '\\', 'x4', '"']);
+ expect(tokenTypes('"\\x4"')).toEqual([DoubleQuote, Backslash, Identifier, DoubleQuote]);
+
+ expect(tokenImages('"\\u123"')).toEqual(['"', '\\', 'u123', '"']);
+ expect(tokenTypes('"\\u123"')).toEqual([DoubleQuote, Backslash, Identifier, DoubleQuote]);
+ });
+
+ test('recognizes decimal, hex, and named entities', () => {
+ expect(tokenImages('Fish &amp; Chips')).toEqual(['Fish', ' ', '&amp;', ' ', 'Chips']);
+ expect(tokenTypes('Fish &amp; Chips')).toEqual([Identifier, Whitespace, CharacterEntity, Whitespace, Identifier]);
+
+ expect(tokenImages('Hex: &#x41; Dec: &#65;')).toEqual(['Hex:', ' ', '&#x41;', ' ', 'Dec:', ' ', '&#65;']);
+ const types = tokenTypes('Hex: &#x41; Dec: &#65;');
+ expect(types).toContain(CharacterEntity);
+ });
+
+ test('does not match invalid or incomplete entities', () => {
+ // Missing semicolon or bare ampersand should not be CharacterEntity
+ expect(tokenImages('A & B')).toEqual(['A', ' ', '&', ' ', 'B']);
+ const types = tokenTypes('A & B');
+ expect(types).not.toContain(CharacterEntity);
+
+ expect(tokenImages('Bad: &abc more')).toEqual(['Bad:', ' ', '&abc', ' ', 'more']);
+ expect(tokenTypes('Bad: &abc more')).not.toContain(CharacterEntity);
+ });
+
+ test('allows dot, colon, and hyphen', () => {
+ expect(tokenImages('<xml:tag.name data-value="x">')).toEqual([
+ '<',
+ 'xml:tag.name',
+ ' ',
+ 'data-value',
+ '=',
+ '"',
+ 'x',
+ '"',
+ '>',
+ ]);
+ const types = tokenTypes('<xml:tag.name data-value="x">');
+ expect(types[1]).toBe(Identifier);
+ expect(types[3]).toBe(Identifier);
+ });
+
+ test('stops before comment close sequence', () => {
+ // Identifier should not consume the leading '-' that starts a comment close
+ expect(tokenImages('name--->')).toEqual(['name', '--->']);
+ expect(tokenTypes('name--->')).toEqual([Identifier, CommentClose]);
+ });
+
+ test('ASCII whitespace groups into Whitespace token', () => {
+ const ws = ' \t\n\r\v\f ';
+ expect(tokenTypes(ws)).toEqual([Whitespace]);
+ expect(tokenImages(ws)).toEqual([ws]);
+ });
+
+ test('Unicode whitespace is not Whitespace', () => {
+ const nbsp = '\u00A0';
+ const emsp = '\u2003';
+ const ideographic = '\u3000';
+
+ // Single unicode spaces should be Arbitrary tokens
+ expect(tokenTypes(nbsp)).toEqual([Arbitrary]);
+ expect(tokenImages(nbsp)).toEqual(['\u00A0']);
+
+ expect(tokenTypes(emsp)).toEqual([Arbitrary]);
+ expect(tokenImages(emsp)).toEqual(['\u2003']);
+
+ expect(tokenTypes(ideographic)).toEqual([Arbitrary]);
+ expect(tokenImages(ideographic)).toEqual(['\u3000']);
+
+ // Mixed ASCII + Unicode whitespace keeps boundaries
+ expect(tokenImages('a ' + '\u2003' + ' b')).toEqual(['a', ' ', '\u2003', ' ', 'b']);
+ expect(tokenTypes('a ' + '\u2003' + ' b')).toEqual([Identifier, Whitespace, Arbitrary, Whitespace, Identifier]);
+ });
+
+ test('single braces and invalid ampersands are Arbitrary', () => {
+ expect(tokenTypes('{')).toEqual([Arbitrary]);
+ expect(tokenTypes('}')).toEqual([Arbitrary]);
+ expect(tokenTypes('&')).toEqual([Arbitrary]);
+ expect(tokenImages('&;')).toEqual(['&;']);
+ expect(tokenTypes('&;')).toEqual([CharacterEntity]);
+ expect(tokenImages('&z;')).toEqual(['&z;']); // still a CharacterEntity-like name by pattern
+ expect(tokenTypes('&z;')).toEqual([CharacterEntity]);
+ });
+
+ test('slash not followed by > stays in Arbitrary', () => {
+ expect(tokenImages('a/b')).toEqual(['a', '/b']);
+ expect(tokenTypes('a/b')).toEqual([Identifier, Arbitrary]);
+ });
+});
+
+describe('Source Position and Error Tests', () => {
+ test('should provide correct source positions', () => {
+ const result = tokenize('<task>content');
+ expect(result.errors).toHaveLength(0);
+
+ const tokens = result.tokens;
+ expect(tokens[0].startOffset).toBe(0);
+ expect(tokens[0].endOffset).toBe(0);
+ expect(tokens[0].image).toBe('<');
+
+ expect(tokens[1].startOffset).toBe(1);
+ expect(tokens[1].endOffset).toBe(4);
+ expect(tokens[1].image).toBe('task');
+
+ expect(tokens[2].startOffset).toBe(5);
+ expect(tokens[2].endOffset).toBe(5);
+ expect(tokens[2].image).toBe('>');
+ });
+
+ test('should handle line and column tracking', () => {
+ const input = `line1
+line2 <task>
+line3`;
+ const result = tokenize(input);
+
+ const tagToken = result.tokens.find((t) => t.tokenType === OpenBracket);
+ expect(tagToken).toBeDefined();
+ expect(tagToken!.startLine).toBe(2);
+ expect(tagToken!.startColumn).toBe(7); // After "line2 "
+ });
+
+ test('should handle malformed input gracefully', () => {
+ const result = tokenize('content');
+ const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset - b.startOffset);
+
+ for (let i = 0; i < sortedTokens.length - 1; i++) {
+ const current = sortedTokens[i];
+ const next = sortedTokens[i + 1];
+ expect(current.endOffset).toBeLessThanOrEqual(next.startOffset);
+ }
+ });
+
+ test('should handle empty input', () => {
+ const result = tokenize('');
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(0);
+ });
+
+ test('should handle whitespace only input', () => {
+ const result = tokenize(' \t\n ');
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(1);
+ expect(result.tokens[0].tokenType).toBe(Whitespace);
+ });
+});
+
+describe('Complex Mixed Content', () => {
+ test('should handle extended POML specification example', () => {
+ const input = `# My Analysis
+
+<task>
+Analyze data
+</task>
+
+{{variable}}`;
+
+ const images = tokenImages(input);
+ expect(images).toContain('#');
+ expect(images).toContain('My');
+ expect(images).toContain('Analysis');
+ expect(images).toContain('<');
+ expect(images).toContain('task');
+ expect(images).toContain('>');
+ expect(images).toContain('{{');
+ expect(images).toContain('variable');
+ expect(images).toContain('}}');
+ });
+
+ test('should handle comments with mixed content', () => {
+ expect(tokenImages('<!----><task>content</task>')).toEqual([
+ '<!--',
+ '-->',
+ '<',
+ 'task',
+ '>',
+ 'content',
+ '</',
+ 'task',
+ '>',
+ ]);
+ });
+
+ test('should handle nested quotes and templates', () => {
+ expect(tokenImages('<meta value="{{path}}/file.txt">')).toEqual([
+ '<',
+ 'meta',
+ ' ',
+ 'value',
+ '=',
+ '"',
+ '{{',
+ 'path',
+ '}}',
+ '/file.txt',
+ '"',
+ '>',
+ ]);
+ });
+});
+
+describe('Boundary Conditions', () => {
+ test('should handle single character inputs', () => {
+ expect(tokenize('<').tokens).toHaveLength(1);
+ expect(tokenize('>').tokens).toHaveLength(1);
+ expect(tokenize('"').tokens).toHaveLength(1);
+ expect(tokenize("'").tokens).toHaveLength(1);
+ expect(tokenize('\\').tokens).toHaveLength(1);
+ expect(tokenize('=').tokens).toHaveLength(1);
+ expect(tokenize(' ').tokens).toHaveLength(1);
+ expect(tokenize('\t').tokens).toHaveLength(1);
+ expect(tokenize('\n').tokens).toHaveLength(1);
+ expect(tokenize('a').tokens).toHaveLength(1);
+ expect(tokenize('_').tokens).toHaveLength(1);
+ expect(tokenize('1').tokens).toHaveLength(1);
+ expect(tokenize('@').tokens).toHaveLength(1);
+ });
+
+ test('should handle two character edge cases', () => {
+ expect(tokenImages('{{')).toEqual(['{{']);
+ expect(tokenImages('}}')).toEqual(['}}']);
+ expect(tokenImages('</')).toEqual(['</']);
+ expect(tokenImages('/>')).toEqual(['/>']);
+ expect(tokenImages('{}')).toEqual(['{}']);
+ expect(tokenImages('}{')).toEqual(['}{']);
+ expect(tokenImages('""')).toEqual(['"', '"']);
+ expect(tokenImages("''")).toEqual(["'", "'"]);
+ expect(tokenImages('<>')).toEqual(['<', '>']);
+ });
+
+ test('should handle minimum valid patterns', () => {
+ expect(tokenImages('<!---->')).toEqual(['<!--', '-->']);
+ expect(tokenImages('<a>')).toEqual(['<', 'a', '>']);
+ expect(tokenImages('</a>')).toEqual(['</', 'a', '>']);
+ expect(tokenImages('<a/>')).toEqual(['<', 'a', '/>']);
+ });
+
+ test('should handle very long inputs without crashes', () => {
+ const longText = 'a'.repeat(100000);
+ const result = tokenize(longText);
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(1);
+ expect(result.tokens[0].image).toBe(longText);
+
+ const longComment = `<!--${'a'.repeat(100000)}-->`;
+ const commentResult = tokenize(longComment);
+ expect(commentResult.errors).toHaveLength(0);
+ expect(commentResult.tokens).toHaveLength(3);
+
+ const longIdentifier = 'a' + 'b'.repeat(10000);
+ const identifierResult = tokenize(longIdentifier);
+ expect(identifierResult.errors).toHaveLength(0);
+ expect(identifierResult.tokens).toHaveLength(1);
+ });
+
+ test('should handle maximum practical complexity', () => {
+ const complexInput =
+ '<' +
+ 'tag'.repeat(1000) +
+ ' attr="' +
+ 'value'.repeat(1000) +
+ '">' +
+ 'content'.repeat(1000) +
+ '</' +
+ 'tag'.repeat(1000) +
+ '>';
+ const result = tokenize(complexInput);
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(13);
+ });
+
+ test('should handle deeply nested structures', () => {
+ let nested = '';
+ for (let i = 0; i < 100; i++) {
+ nested += `<tag${i}>`;
+ }
+ nested += 'content';
+ for (let i = 99; i >= 0; i--) {
+ nested += `</tag${i}>`;
+ }
+ const result = tokenize(nested);
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(601);
+ });
+});
+
+describe('Unicode and Special Characters', () => {
+ test('should handle CJK characters', () => {
+ expect(tokenImages('你好世界')).toEqual(['你好世界']);
+ expect(tokenImages('こんにちは')).toEqual(['こんにちは']);
+ expect(tokenImages('안녕하세요')).toEqual(['안녕하세요']);
+ });
+
+ test('should handle emoji and symbols', () => {
+ expect(tokenImages('Hello 👋 World 🌍')).toEqual(['Hello', ' ', '👋', ' ', 'World', ' ', '🌍']);
+ expect(tokenImages('Math: ∑∞π≠∅')).toEqual(['Math:', ' ', '∑∞π≠∅']);
+ expect(tokenImages('Arrows: ←→↑↓')).toEqual(['Arrows:', ' ', '←→↑↓']);
+ });
+
+ test('should handle unicode', () => {
+ expect(tokenImages('<こんにちは>')).toEqual(['<', 'こんにちは', '>']);
+ expect(tokenImages('{{你好}}')).toEqual(['{{', '你好', '}}']);
+ expect(tokenImages('<tag attr="café">')).toEqual(['<', 'tag', ' ', 'attr', '=', '"', 'caf', 'é', '"', '>']);
+ });
+
+ test('should maintain lexer stability with all edge cases', () => {
+ // Combination of many edge cases
+ const stressTest =
+ '\uFEFF\x00\x01\x02\x03\x07content\x08\x09\x0A';
+
+ const result = tokenize(stressTest);
+ expect(result.tokens.length).toBeGreaterThan(0);
+
+ // Verify token integrity
+ result.tokens.forEach((token) => {
+ expect(token.startOffset).toBeGreaterThanOrEqual(0);
+ if (token.endOffset !== undefined) {
+ expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset);
+ }
+ });
+ });
+});
+
+describe('Malformed Patterns', () => {
+ test('should handle incomplete comments', () => {
+ expect(tokenImages('<!--')).toEqual(['<!--']);
+ expect(tokenImages('<!-- unclosed')).toEqual(['<!--', ' ', 'unclosed']);
+ expect(tokenImages('<!----><tag>more{{ content')).toEqual([
+ '<!--',
+ '-->',
+ '<',
+ 'tag',
+ '>',
+ 'more',
+ '{{',
+ ' ',
+ 'content',
+ ]);
+ expect(tokenImages("\"quoted textend")).toEqual([
+ '"',
+ 'quoted',
+ ' ',
+ 'text',
+ '<',
+ 'tag',
+ ' ',
+ 'attr',
+ '=',
+ "'",
+ 'mixed',
+ "'",
+ '>',
+ 'end',
+ ]);
+ });
+
+ test('should handle lookahead boundary cases for single braces', () => {
+ expect(tokenImages('{nottemplate}')).toEqual(['{nottemplate}']);
+ expect(tokenImages('}notclosing{')).toEqual(['}notclosing{']);
+ expect(tokenImages('text{more}text')).toEqual(['text', '{more}text']);
+ expect(tokenImages('before}after')).toEqual(['before', '}after']);
+ expect(tokenImages('before{after')).toEqual(['before', '{after']);
+ expect(tokenImages('text } { more')).toEqual(['text', ' ', '}', ' ', '{', ' ', 'more']);
+ });
+
+ test('should handle greedy vs non-greedy matching', () => {
+ expect(tokenImages('<!----><!---->')).toEqual(['<!--', '-->', '<!--', '-->']);
+ expect(tokenImages('{{first}}{{second}}')).toEqual(['{{', 'first', '}}', '{{', 'second', '}}']);
+ expect(tokenImages('text<!---->more')).toEqual(['text', '<!--', '-->', 'more']);
+ });
+
+ test('should handle single braces correctly', () => {
+ // Single { or } are OK if not followed by another brace
+ expect(tokenImages('text { more text')).toEqual(['text', ' ', '{', ' ', 'more', ' ', 'text']);
+ expect(tokenImages('text } more text')).toEqual(['text', ' ', '}', ' ', 'more', ' ', 'text']);
+ expect(tokenImages('a{b}c')).toEqual(['a', '{b}c']);
+ expect(tokenImages('path{index}')).toEqual(['path', '{index}']);
+ expect(tokenImages('array[{key}]')).toEqual(['array', '[{key}]']);
+ expect(tokenImages('{ not {{ double')).toEqual(['{', ' ', 'not', ' ', '{{', ' ', 'double']);
+ expect(tokenImages('} not }} double')).toEqual(['}', ' ', 'not', ' ', '}}', ' ', 'double']);
+ expect(tokenImages('{}empty{}')).toEqual(['{}empty{}']);
+ expect(tokenImages('}{reversed}{')).toEqual(['}{reversed}{']);
+ });
+
+ test('should handle incomplete tag delimiters', () => {
+ // Incomplete tag delimiters such as / (except /< and />)
+ expect(tokenImages('path/to/file')).toEqual(['path', '/to/file']);
+ expect(tokenImages('a/b/c')).toEqual(['a', '/b/c']);
+ expect(tokenImages('text / more')).toEqual(['text', ' ', '/', ' ', 'more']);
+ expect(tokenImages('http://example.com')).toEqual(['http:', '//example.com']);
+ expect(tokenImages('5/3=1.67')).toEqual(['5/3', '=', '1.67']);
+ // These should NOT match as incomplete delimiters
+ expect(tokenImages('/<tag>')).toEqual(['/', '<', 'tag', '>']);
+ expect(tokenImages('/>')).toEqual(['/>']);
+ expect(tokenImages('</tag>')).toEqual(['</', 'tag', '>']);
+ });
+
+ test('should handle incomplete comment delimiters', () => {
+ // Incomplete comment delimiters such as !-- or -- are OK
+ expect(tokenImages('text !-- not comment')).toEqual(['text', ' ', '!--', ' ', 'not', ' ', 'comment']);
+ expect(tokenImages('text -- also not')).toEqual(['text', ' ', '--', ' ', 'also', ' ', 'not']);
+ expect(tokenImages('a--b')).toEqual(['a--b']);
+ expect(tokenImages('!--incomplete')).toEqual(['!--incomplete']);
+ expect(tokenImages('--dashes--')).toEqual(['--dashes--']);
+ expect(tokenImages('')).toEqual([
+ '',
+ ]);
+ expect(tokenImages('not')).toEqual(['not', '']);
+ expect(tokenImages('---triple-dash')).toEqual(['---triple-dash']);
+ expect(tokenImages('text --- more')).toEqual(['text', ' ', '---', ' ', 'more']);
+ });
+
+ test('should handle incorrect @pragma directives', () => {
+ // Incorrect @pragma directive such as @pragm or @pragmaX will be matched as Arbitrary
+ expect(tokenImages('@pragma')).toEqual(['@pragma']);
+ expect(tokenImages('@pragm')).toEqual(['@pragm']);
+ expect(tokenImages('@pragmaX')).toEqual(['@pragma', 'X']);
+ expect(tokenImages('@pragma-extended')).toEqual(['@pragma', '-extended']);
+ expect(tokenImages('@@pragma')).toEqual(['@@pragma']);
+ expect(tokenImages('not@pragma')).toEqual(['not', '@pragma']);
+ expect(tokenImages('@PRAGMA')).toEqual(['@PRAGMA']);
+ expect(tokenImages('@Pragma')).toEqual(['@Pragma']);
+ expect(tokenImages('@pragma key=value')).toEqual(['@pragma', ' ', 'key', '=', 'value']);
+ });
+
+ test('should handle >', () => {
+ expect(tokenImages('</>')).toEqual(['</', '>']);
+ expect(tokenImages('< />')).toEqual(['<', ' ', '/>']);
+ expect(tokenImages('< / >')).toEqual(['<', ' ', '/', ' ', '>']);
+ expect(tokenImages('<//>')).toEqual(['</', '/>']);
+ });
+});
+
+describe('Position Tracking Accuracy', () => {
+ test('should track positions accurately across multiple lines', () => {
+ const input = `line1
+<task>content</task>
+{{variable}}
+final line`;
+ const result = tokenize(input);
+
+ const tagOpenToken = result.tokens.find((t) => t.image === '<' && t.startLine === 2);
+ expect(tagOpenToken).toBeDefined();
+ expect(tagOpenToken!.startColumn).toBe(1);
+
+ const variableToken = result.tokens.find((t) => t.image === 'variable');
+ expect(variableToken).toBeDefined();
+ expect(variableToken!.startLine).toBe(3);
+ });
+
+ test('should track positions accurately with mixed line endings', () => {
+ const input = 'line1\r\nline2\nline3\r';
+ const result = tokenize(input);
+
+ expect(result.tokens.length).toBeGreaterThan(0);
+ result.tokens.forEach((token) => {
+ expect(token.startOffset).toBeGreaterThanOrEqual(0);
+ expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!);
+ expect(token.startLine).toBeGreaterThanOrEqual(1);
+ expect(token.startColumn).toBeGreaterThanOrEqual(1);
+ });
+ });
+
+ test('should handle position tracking with empty tokens', () => {
+ const input = '<>""\'\'{{}}< >';
+ const result = tokenize(input);
+
+ // Verify all tokens have valid positions
+ result.tokens.forEach((token) => {
+ expect(token.startOffset).toBeGreaterThanOrEqual(0);
+ expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!);
+ expect(token.startLine).toBeGreaterThanOrEqual(1);
+ expect(token.startColumn).toBeGreaterThanOrEqual(1);
+ });
+ });
+
+ test('should track positions accurately with tabs and mixed whitespace', () => {
+ const input = '\t<tag>\n\t\t<inner>\t\tcontent\t</inner></tag>\n';
+ const result = tokenize(input);
+
+ // Find tokens and verify their positions make sense
+ const tagOpen = result.tokens.find((t) => t.image === '<' && t.startLine === 1);
+ const innerOpen = result.tokens.find((t) => t.image === '<' && t.startLine === 2);
+
+ expect(tagOpen).toBeDefined();
+ expect(innerOpen).toBeDefined();
+ expect(tagOpen!.startColumn).toBe(2); // After tab
+ expect(innerOpen!.startColumn).toBe(3); // After two tabs
+ });
+
+ test('should verify complete coverage with no gaps', () => {
+ const input = 'content{{var}}';
+ const result = tokenize(input);
+
+ // Sort tokens by start position
+ const sortedTokens = [...result.tokens].sort((a, b) => a.startOffset! - b.startOffset!);
+
+ // Verify complete coverage
+ let expectedOffset = 0;
+ sortedTokens.forEach((token) => {
+ expect(token.startOffset).toBeGreaterThanOrEqual(expectedOffset);
+ expectedOffset = token.endOffset! + 1;
+ });
+
+ // Should cover the entire input
+ expect(expectedOffset).toBeGreaterThanOrEqual(input.length);
+ });
+
+ test('should handle position tracking with comments spanning multiple lines', () => {
+ const input = `text
+<!-- a comment
+spanning two lines -->
+more text`;
+
+ const result = tokenize(input);
+ const commentToken = result.tokens.find((t) => t.tokenType.name === 'CommentOpen');
+
+ expect(commentToken).toBeDefined();
+ expect(commentToken!.startLine).toBe(2);
+ expect(commentToken!.endLine).toBe(2);
+ });
+
+ test('should handle position tracking with carriage returns', () => {
+ const input = 'line1\r\rcontent\r';
+ const result = tokenize(input);
+
+ // Check that line numbers increase correctly
+ const lines = new Set(result.tokens.map((t) => t.startLine));
+ expect(lines.size).toBeGreaterThan(1);
+
+ // Verify positions are sequential
+ result.tokens.forEach((token) => {
+ expect(token.startOffset).toBeGreaterThanOrEqual(0);
+ expect(token.endOffset).toBeGreaterThanOrEqual(token.startOffset!);
+ });
+ });
+});
+
+describe('Performance and Stress Tests', () => {
+ test('should handle extremely long text content without performance degradation', () => {
+ const longText = 'a'.repeat(1000000); // 1MB of text
+ const start = performance.now();
+ const result = tokenize(longText);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(1);
+ expect(result.tokens[0].image).toBe(longText);
+ expect(end - start).toBeLessThan(1000); // Should complete in under 1 second
+ });
+
+ test('should handle very long comments efficiently', () => {
+ const longComment = `<!--${'a'.repeat(1000000)}-->`;
+ const start = performance.now();
+ const result = tokenize(longComment);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens).toHaveLength(3);
+ expect(end - start).toBeLessThan(500); // Should be fast
+ });
+
+ test('should handle many small tokens efficiently', () => {
+ const manyTokens = Array(10000).fill('content').join(' ');
+ const start = performance.now();
+ const result = tokenize(manyTokens);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens.length).toBeGreaterThan(10000);
+ expect(end - start).toBeLessThan(2000); // Should handle many tokens
+ });
+
+ test('should handle deeply nested template variables', () => {
+ let nested = '';
+ for (let i = 0; i < 1000; i++) {
+ nested += `{{var${i}}}`;
+ }
+
+ const start = performance.now();
+ const result = tokenize(nested);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens.length).toBe(3000); // 1000 * (open + content + close)
+ expect(end - start).toBeLessThan(1000);
+ });
+
+ test('should handle memory efficiently with large repetitive content', () => {
+ const pattern = '{{content}}';
+ const repeated = Array(1000).fill(pattern).join('\n');
+
+ const start = performance.now();
+ const result = tokenize(repeated);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens.length).toBeGreaterThan(5000);
+ expect(end - start).toBeLessThan(1500);
+ });
+
+ test('should handle worst-case regex backtracking scenarios', () => {
+ // Patterns that could cause regex catastrophic backtracking
+ const backtrackingTests = [
+ 'a'.repeat(10000) + 'b',
+ '{'.repeat(5000) + '}',
+ '<'.repeat(1000) + '>',
+ '"'.repeat(2000),
+ '',
+ Array(1000).fill('{{}}').join(''),
+ ];
+
+ backtrackingTests.forEach((test) => {
+ const start = performance.now();
+ const result = tokenize(test);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(end - start).toBeLessThan(1000); // Should not hang
+ });
+ });
+
+ test('should maintain linear performance with input size', () => {
+ const sizes = [1000, 5000, 10000, 20000];
+ const times: number[] = [];
+
+ sizes.forEach((size) => {
+ const content = 'x'.repeat(size);
+ const start = performance.now();
+ tokenize(content);
+ const end = performance.now();
+ times.push(end - start);
+ });
+
+ // Performance should scale roughly linearly
+ expect(times[1]).toBeLessThan(times[0] * 10);
+ expect(times[2]).toBeLessThan(times[1] * 5);
+ expect(times[3]).toBeLessThan(times[2] * 3);
+ });
+
+ test('should handle maximum practical input sizes', () => {
+ // Test with a large input (~190 KB) built from repeated tags
+ const hugeContent = Array(10000).fill('<tag>content</tag>').join(' ');
+ expect(hugeContent.length).toBe(10000 * 19 - 1);
+
+ const start = performance.now();
+ const result = tokenize(hugeContent);
+ const end = performance.now();
+
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens.length).toBeGreaterThan(0);
+ expect(end - start).toBeLessThan(5000); // 5 second max for 10MB
+ });
+});
+
+describe('Error Recovery', () => {
+ test('should handle incomplete template variables', () => {
+ const result = tokenize('text {{incomplete');
+ expect(result.errors).toHaveLength(0);
+ expect(result.tokens.length).toBeGreaterThan(0);
+
+ const types = result.tokens.map((t) => t.tokenType);
+ expect(types).toContain(Identifier);
+ expect(types).toContain(TemplateOpen);
+ });
+
+ test('should handle unclosed comments', () => {
+ const result = tokenize('