Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/App.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import { fetchClimateAnomalies } from '@/services/climate';
import { enrichEventsWithExposure } from '@/services/population-exposure';
import { buildMapUrl, debounce, loadFromStorage, parseMapUrlState, saveToStorage, ExportPanel, getCircuitBreakerCooldownInfo, isMobileDevice, setTheme, getCurrentTheme } from '@/utils';
import { reverseGeocode } from '@/utils/reverse-geocode';
import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match';
import { CountryBriefPage } from '@/components/CountryBriefPage';
import { maybeShowDownloadBanner } from '@/components/DownloadBanner';
import { mountCommunityWidget } from '@/components/CommunityWidget';
Expand Down Expand Up @@ -3166,15 +3167,13 @@ export class App {
}

private findFlashLocation(title: string): { lat: number; lon: number } | null {
const titleLower = title.toLowerCase();
let bestMatch: { lat: number; lon: number; matches: number } | null = null;

const tokens = tokenizeForMatch(title);
const countKeywordMatches = (keywords: string[] | undefined): number => {
if (!keywords) return 0;
let matches = 0;
for (const keyword of keywords) {
const cleaned = keyword.trim().toLowerCase();
if (cleaned.length >= 3 && titleLower.includes(cleaned)) {
if (matchKeyword(tokens, keyword)) {
matches++;
}
}
Expand Down
14 changes: 7 additions & 7 deletions src/components/DeckGLMap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Uses deck.gl for high-performance rendering of large datasets
* Mobile devices gracefully degrade to the D3/SVG-based Map component
*/
import { tokenizeForMatch, matchKeyword, matchesAnyKeyword, findMatchingKeywords } from '@/utils/keyword-match';
import { MapboxOverlay } from '@deck.gl/mapbox';
import type { Layer, LayersList, PickingInfo } from '@deck.gl/core';
import { GeoJsonLayer, ScatterplotLayer, PathLayer, IconLayer, TextLayer } from '@deck.gl/layers';
Expand Down Expand Up @@ -3382,10 +3383,9 @@ export class DeckGLMap {
const matchCounts = new Map<string, number>();

recentNews.forEach(item => {
const tokens = tokenizeForMatch(item.title);
this.hotspots.forEach(hotspot => {
if (hotspot.keywords.some(kw =>
item.title.toLowerCase().includes(kw.toLowerCase())
)) {
if (matchesAnyKeyword(tokens, hotspot.keywords)) {
breakingKeywords.add(hotspot.id);
matchCounts.set(hotspot.id, (matchCounts.get(hotspot.id) || 0) + 1);
}
Expand All @@ -3411,22 +3411,22 @@ export class DeckGLMap {

return this.news
.map((item) => {
const titleLower = item.title.toLowerCase();
const matchedKeywords = hotspot.keywords.filter((kw) => titleLower.includes(kw.toLowerCase()));
const tokens = tokenizeForMatch(item.title);
const matchedKeywords = findMatchingKeywords(tokens, hotspot.keywords);

if (matchedKeywords.length === 0) return null;

// Check if this news mentions other hotspot conflict topics
const conflictMatches = conflictTopics.filter(t =>
titleLower.includes(t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t))
matchKeyword(tokens, t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t))
);

// If article mentions a major conflict topic that isn't this hotspot, deprioritize heavily
if (conflictMatches.length > 0) {
// Only include if it ALSO has a strong local keyword (city name, agency)
const strongLocalMatch = matchedKeywords.some(kw =>
kw.toLowerCase() === hotspot.name.toLowerCase() ||
hotspot.agencies?.some(a => titleLower.includes(a.toLowerCase()))
hotspot.agencies?.some(a => matchKeyword(tokens, a))
);
if (!strongLocalMatch) return null;
}
Expand Down
13 changes: 7 additions & 6 deletions src/components/Map.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as d3 from 'd3';
import * as topojson from 'topojson-client';
import { escapeHtml } from '@/utils/sanitize';
import { tokenizeForMatch, matchKeyword, findMatchingKeywords } from '@/utils/keyword-match';
import { getCSSColor } from '@/utils';
import type { Topology, GeometryCollection } from 'topojson-specification';
import type { Feature, Geometry } from 'geojson';
Expand Down Expand Up @@ -2737,22 +2738,22 @@ export class MapComponent {

return this.news
.map((item) => {
const titleLower = item.title.toLowerCase();
const matchedKeywords = hotspot.keywords.filter((kw) => titleLower.includes(kw.toLowerCase()));
const tokens = tokenizeForMatch(item.title);
const matchedKeywords = findMatchingKeywords(tokens, hotspot.keywords);

if (matchedKeywords.length === 0) return null;

// Check if this news mentions other hotspot conflict topics
const conflictMatches = conflictTopics.filter(t =>
titleLower.includes(t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t))
matchKeyword(tokens, t) && !hotspot.keywords.some(k => k.toLowerCase().includes(t))
);

// If article mentions a major conflict topic that isn't this hotspot, deprioritize heavily
if (conflictMatches.length > 0) {
// Only include if it ALSO has a strong local keyword (city name, agency)
const strongLocalMatch = matchedKeywords.some(kw =>
kw.toLowerCase() === hotspot.name.toLowerCase() ||
hotspot.agencies?.some(a => titleLower.includes(a.toLowerCase()))
hotspot.agencies?.some(a => matchKeyword(tokens, a))
);
if (!strongLocalMatch) return null;
}
Expand All @@ -2776,8 +2777,8 @@ export class MapComponent {
let matchedCount = 0;

news.forEach((item) => {
const titleLower = item.title.toLowerCase();
const matches = spot.keywords.filter((kw) => titleLower.includes(kw.toLowerCase()));
const tokens = tokenizeForMatch(item.title);
const matches = findMatchingKeywords(tokens, spot.keywords);

if (matches.length > 0) {
matchedCount++;
Expand Down
4 changes: 2 additions & 2 deletions src/config/geo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ export const INTEL_HOTSPOTS: Hotspot[] = [
lat: 38.9,
lon: -77.0,
location: 'Washington D.C., USA',
keywords: ['pentagon', 'white house', 'congress', 'cia', 'nsa', 'washington', 'biden', 'trump', 'house', 'senate', 'supreme court', 'vance', 'elon', 'us '],
keywords: ['pentagon', 'white house', 'congress', 'cia', 'nsa', 'washington', 'biden', 'trump', 'senate', 'supreme court', 'vance', 'elon'],
agencies: ['Pentagon', 'CIA', 'NSA', 'State Dept'],
description: 'US government and military headquarters. Intelligence community center.',
status: 'Monitoring',
Expand Down Expand Up @@ -375,7 +375,7 @@ export const INTEL_HOTSPOTS: Hotspot[] = [
lat: 33.5,
lon: 36.3,
location: 'Syria',
keywords: ['syria', 'damascus', 'assad', 'syrian', 'hts'],
keywords: ['syria', 'damascus', 'assad', 'syrian', 'hts', 'tahrir al-sham', 'hayat tahrir'],
agencies: ['Syrian Govt', 'HTS', 'Russian Forces', 'Turkish Forces'],
description: 'Syrian civil war aftermath. Multiple foreign interventions.',
status: 'Monitoring',
Expand Down
11 changes: 6 additions & 5 deletions src/services/country-instability.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { SocialUnrestEvent, MilitaryFlight, MilitaryVessel, ClusteredEvent, InternetOutage } from '@/types';
import { INTEL_HOTSPOTS, CONFLICT_ZONES, STRATEGIC_WATERWAYS } from '@/config/geo';
import { TIER1_COUNTRIES } from '@/config/countries';
import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match';
import { focalPointDetector } from './focal-point-detector';
import type { ConflictEvent, UcdpConflictStatus, HapiConflictSummary } from './conflict';
import type { CountryDisplacement } from '@/services/displacement';
Expand Down Expand Up @@ -196,12 +197,12 @@ export { COUNTRY_BOUNDS };
export type { CountryData };

function normalizeCountryName(name: string): string | null {
const lower = name.toLowerCase();
const tokens = tokenizeForMatch(name);
for (const [code, keywords] of Object.entries(COUNTRY_KEYWORDS)) {
if (keywords.some(kw => lower.includes(kw))) return code;
if (keywords.some(kw => matchKeyword(tokens, kw))) return code;
}
for (const [code, countryName] of Object.entries(TIER1_COUNTRIES)) {
if (lower.includes(countryName.toLowerCase())) return code;
if (matchKeyword(tokens, countryName)) return code;
}
return null;
}
Expand Down Expand Up @@ -455,10 +456,10 @@ export function ingestMilitaryForCII(flights: MilitaryFlight[], vessels: Militar

export function ingestNewsForCII(events: ClusteredEvent[]): void {
for (const e of events) {
const title = e.primaryTitle.toLowerCase();
const tokens = tokenizeForMatch(e.primaryTitle);
for (const [code] of Object.entries(TIER1_COUNTRIES)) {
const keywords = COUNTRY_KEYWORDS[code] || [];
if (keywords.some(kw => title.includes(kw))) {
if (keywords.some(kw => matchKeyword(tokens, kw))) {
if (!countryDataMap.has(code)) countryDataMap.set(code, initCountryData());
countryDataMap.get(code)!.newsEvents.push(e);
}
Expand Down
8 changes: 4 additions & 4 deletions src/services/entity-index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ export function findEntitiesInText(text: string): EntityMatch[] {
const index = getEntityIndex();
const matches: EntityMatch[] = [];
const seen = new Set<string>();
const textLower = text.toLowerCase();

for (const [alias, entityId] of index.byAlias) {
if (alias.length < 3) continue;
Expand All @@ -123,18 +122,19 @@ export function findEntitiesInText(text: string): EntityMatch[] {

for (const [keyword, entityIds] of index.byKeyword) {
if (keyword.length < 3) continue;
if (!textLower.includes(keyword)) continue;
const kwRegex = new RegExp(`\\b${escapeRegex(keyword)}\\b`, 'gi');
const kwMatch = kwRegex.exec(text);
if (!kwMatch) continue;

for (const entityId of entityIds) {
if (seen.has(entityId)) continue;

const pos = textLower.indexOf(keyword);
matches.push({
entityId,
matchedText: keyword,
matchType: 'keyword',
confidence: 0.7,
position: pos,
position: kwMatch.index,
});
seen.add(entityId);
}
Expand Down
14 changes: 3 additions & 11 deletions src/services/geo-hub-index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// Geopolitical Hub Index - aggregates news by strategic locations
import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match';

export interface GeoHubLocation {
id: string;
Expand Down Expand Up @@ -109,20 +110,11 @@ export interface GeoHubMatch {
export function inferGeoHubsFromTitle(title: string): GeoHubMatch[] {
const index = buildGeoHubIndex();
const matches: GeoHubMatch[] = [];
const titleLower = title.toLowerCase();
const tokens = tokenizeForMatch(title);
const seenHubs = new Set<string>();

for (const [keyword, hubIds] of index.byKeyword) {
if (keyword.length < 2) continue;

// Word boundary check for short keywords to avoid false positives
const regex = keyword.length < 5
? new RegExp(`\\b${keyword}\\b`, 'i')
: null;

const found = regex
? regex.test(titleLower)
: titleLower.includes(keyword);
const found = matchKeyword(tokens, keyword);

if (found) {
for (const hubId of hubIds) {
Expand Down
13 changes: 5 additions & 8 deletions src/services/related-assets.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { ClusteredEvent, RelatedAsset, AssetType, RelatedAssetContext } from '@/types';
import { t } from '@/services/i18n';
import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match';
import {
INTEL_HOTSPOTS,
CONFLICT_ZONES,
Expand Down Expand Up @@ -27,24 +28,20 @@ interface AssetOrigin {
label: string;
}

function toTitleLower(titles: string[]): string[] {
return titles.map(title => title.toLowerCase());
}

function detectAssetTypes(titles: string[]): AssetType[] {
const normalized = toTitleLower(titles);
const tokenized = titles.map(t => tokenizeForMatch(t));
const types = Object.entries(ASSET_KEYWORDS)
.filter(([, keywords]) =>
normalized.some(title => keywords.some(keyword => title.includes(keyword)))
tokenized.some(tokens => keywords.some(keyword => matchKeyword(tokens, keyword)))
)
.map(([type]) => type as AssetType);
return types;
}

function countKeywordMatches(titles: string[], keywords: string[]): number {
const normalized = toTitleLower(titles);
const tokenized = titles.map(t => tokenizeForMatch(t));
return keywords.reduce((count, keyword) => {
return count + normalized.filter(title => title.includes(keyword)).length;
return count + tokenized.filter(tokens => matchKeyword(tokens, keyword)).length;
}, 0);
}

Expand Down
9 changes: 5 additions & 4 deletions src/services/story-data.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { calculateCII, type CountryScore } from './country-instability';
import type { ClusteredEvent } from '@/types';
import type { ThreatLevel } from './threat-classifier';
import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match';

const COUNTRY_KEYWORDS: Record<string, string[]> = {
US: ['united states', 'usa', 'america', 'washington', 'biden', 'trump', 'pentagon'],
Expand Down Expand Up @@ -87,8 +88,8 @@ export function collectStoryData(

const keywords = COUNTRY_KEYWORDS[countryCode] || [countryName.toLowerCase()];
const countryNews = allNews.filter(e => {
const lower = e.primaryTitle.toLowerCase();
return keywords.some(kw => lower.includes(kw));
const tokens = tokenizeForMatch(e.primaryTitle);
return keywords.some(kw => matchKeyword(tokens, kw));
});

const sortedNews = [...countryNews].sort((a, b) => {
Expand All @@ -104,8 +105,8 @@ export function collectStoryData(
) || null;

const countryMarkets = predictionMarkets.filter(m => {
const lower = m.title.toLowerCase();
return keywords.some(kw => lower.includes(kw));
const tokens = tokenizeForMatch(m.title);
return keywords.some(kw => matchKeyword(tokens, kw));
});

const threatCounts = { critical: 0, high: 0, medium: 0, categories: new Set<string>() };
Expand Down
5 changes: 3 additions & 2 deletions src/services/tech-hub-index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { STARTUP_ECOSYSTEMS } from '@/config/startup-ecosystems';
import { TECH_COMPANIES } from '@/config/tech-companies';
import { STARTUP_HUBS } from '@/config/tech-geo';
import { tokenizeForMatch, matchKeyword } from '@/utils/keyword-match';

export interface TechHubLocation {
id: string;
Expand Down Expand Up @@ -211,14 +212,14 @@ export interface HubMatch {
export function inferHubsFromTitle(title: string): HubMatch[] {
const index = buildTechHubIndex();
const matches: HubMatch[] = [];
const titleLower = title.toLowerCase();
const titleTokens = tokenizeForMatch(title);
const seenHubs = new Set<string>();

// Check each keyword
for (const [keyword, hubIds] of index.byKeyword) {
if (keyword.length < 3) continue; // Skip very short keywords

if (titleLower.includes(keyword)) {
if (matchKeyword(titleTokens, keyword)) {
for (const hubId of hubIds) {
if (seenHubs.has(hubId)) continue;
seenHubs.add(hubId);
Expand Down
64 changes: 64 additions & 0 deletions src/utils/keyword-match.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/**
* Tokenization-based keyword matching for geo-tagging.
* Single source of truth — all geo/hotspot keyword matching imports from here.
*
* Uses Set-based lookups (O(1)) instead of regex to eliminate:
* - Substring false positives ("assad" inside "ambassador")
* - Per-keyword RegExp allocations in hot loops
*
* @see https://github.com/koala73/worldmonitor/issues/324
*/

export interface TokenizedTitle {
  /** Unique lowercase words for O(1) single-word lookups */
  words: Set<string>;
  /** Ordered lowercase words for contiguous phrase matching */
  ordered: string[];
}

/**
 * Tokenize a title into lowercase words.
 * Call once per title, reuse across all keyword checks.
 *
 * Tokens are split on any run of characters outside [a-z0-9'-], then
 * trimmed of wrapping quotes/dashes so headline styling like
 * "'historic' deal" still yields the bare word "historic". Internal
 * apostrophes and hyphens ("don't", "al-sham") are preserved.
 * Possessive stems are additionally indexed in `words` (not `ordered`)
 * so "Biden's visit" matches the single-word keyword "biden".
 */
export function tokenizeForMatch(title: string): TokenizedTitle {
  const ordered = title
    .toLowerCase()
    .split(/[^a-z0-9'-]+/)
    // Trim wrapping quotes/dashes only; keep internal ones intact.
    .map(w => w.replace(/^['-]+|['-]+$/g, ''))
    .filter(w => w.length > 0);

  const words = new Set(ordered);
  // Index possessive stems too: "biden's" -> also add "biden".
  for (const w of ordered) {
    if (w.endsWith("'s") && w.length > 2) {
      words.add(w.slice(0, -2));
    }
  }
  return { words, ordered };
}

/**
 * Check if a single keyword matches within a tokenized title.
 * - Single-word keywords: O(1) Set lookup
 * - Multi-word keywords (e.g. "white house"): contiguous phrase search
 *
 * The keyword is tokenized with the SAME character class used for titles
 * (not just whitespace), so punctuated keywords such as "u.s." or
 * "washington, d.c." line up with the title's tokens instead of silently
 * never matching.
 */
export function matchKeyword(tokens: TokenizedTitle, keyword: string): boolean {
  const parts = keyword
    .toLowerCase()
    .split(/[^a-z0-9'-]+/)
    .filter(w => w.length > 0);
  if (parts.length === 0) return false;

  if (parts.length === 1) {
    return tokens.words.has(parts[0]!);
  }

  // Multi-word: find contiguous phrase in ordered tokens
  const { ordered } = tokens;
  for (let i = 0; i <= ordered.length - parts.length; i++) {
    let match = true;
    for (let j = 0; j < parts.length; j++) {
      if (ordered[i + j] !== parts[j]) {
        match = false;
        break;
      }
    }
    if (match) return true;
  }
  return false;
}

/** True when at least one keyword in the list occurs in the tokenized title. */
export function matchesAnyKeyword(tokens: TokenizedTitle, keywords: string[]): boolean {
  for (const candidate of keywords) {
    if (matchKeyword(tokens, candidate)) return true;
  }
  return false;
}

/** Collect every keyword from the list that occurs in the tokenized title. */
export function findMatchingKeywords(tokens: TokenizedTitle, keywords: string[]): string[] {
  const hits: string[] = [];
  for (const candidate of keywords) {
    if (matchKeyword(tokens, candidate)) hits.push(candidate);
  }
  return hits;
}
Loading