From 9bcbd10e36c3c0c6438232c717a7d57ce46cefb2 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 11 Oct 2025 21:30:38 -0400 Subject: [PATCH 01/15] Update web scraping for printers --- .DS_Store | Bin 6148 -> 0 bytes src/data/scrapers/printers.py | 234 ++++++++++++++++++++++++++++++---- 2 files changed, 211 insertions(+), 23 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 5b928bb719a93f6dc27edc3b82270d14a3109785..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!AiqG5S?wSO({wb3LY1{R*X?v#7n6914i_qQky1dFwK^vwTDv3S%1hc@q3)v z-Ac9U!Gnt3f!Q}ZJF{V4!fpltSZf&U0Mr1$LM1FzaQHxIopeqL+EYSga*q&F*n%zy z=*>kN$6sWC-rXt$_{O?$2ea?5kG>2-f0Xt8Soj$Ci8cYa8RSv)`!mJ=>Yo`FP*4(cW!LCKYRabNk?=eHZr<@t}xh_zg-~(Kv-iG`_6y z-b<5MBsUmGdla$m6uQ4%dDGdg^PT2Q<&C{>RIaOeSL2i$gN6aaz+y6>&r7SinB|+Q z4FiUOFBqWnfuj<-8gqs6=)gg?0EqM(DFtonB`Ak$bT#G*aRr5`R791^bc?}MI{LYe zb2a7)RXQ-;d@%iFraKfSpN{);84k=]+>") +BRACKET_CONTENT_RE = re.compile(r"[\(\[\{].*?[\)\]\}]") +MULTI_SPACE_RE = re.compile(r"\s+") +DELIMS_RE = re.compile(r"\s*[-–—:/|]\s*") +COORD_SPLIT_RE = re.compile(r"\s*,\s*") +ALL_CAPS_PHRASE_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b") +TRAILING_CAPS_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\s*$") +LABEL_PHRASES_RE = re.compile( + r""" + \bresidents?\s*only\b | + \bstudents?\s*only\b | + \baa\s*&\s*p\b | + \baap\b + """, re.IGNORECASE | re.VERBOSE +) +RESIDUAL_TRAILING_LABEL_RE = re.compile( + r"\b(?:resident|residents|student|students|staff|public)\b\s*$", + re.IGNORECASE +) + +def _norm(s): + """ + Unicode/HTML/whitespace normalization. + """ + if s is None: + return "" + s = unicodedata.normalize('NFKC', s) # Normalizes unicode text + s = HTML_TAG_RE.sub(" ", s) + s = s.replace("*", " ") + s = BRACKET_CONTENT_RE.sub(" ", s) + s = MULTI_SPACE_RE.sub(" ", s).strip() + return s + +def _strip_trailing_allcaps(s): + """ + Remove trailing ALL-CAPS qualifiers (e.g., RESIDENTS ONLY). 
+ """ + return TRAILING_CAPS_RE.sub("", s).strip() + +# def _title_clean(s: str) -> str: +# """ +# Nice display casing: keep acronyms as-is, titlecase other words. +# """ +# words = s.split() +# fixed = [w if w.isupper() else w.title() for w in words] +# return " ".join(fixed) + +def _pre_clean_for_match(s: str) -> str: + s = _norm(s) + s = LABEL_PHRASES_RE.sub(" ", s) # <— removes "Resident(s) only", "AA&P", etc. + s = _strip_trailing_allcaps(s) + s = RESIDUAL_TRAILING_LABEL_RE.sub(" ", s) # <— removes "Resident", "Students", etc. + + s = re.sub(r"[^\w\s\-’']", " ", s) # punctuation noise + s = re.sub(r"\s+", " ", s).strip() + return s + +def _token_sort(s): + tokens = s.lower().split() + tokens.sort() + return " ".join(tokens) + +def map_building(name, threshold=87): + if not name: + return None, 0 + + query = _token_sort(_pre_clean_for_match(name)) + canon_token_list = [_token_sort(_pre_clean_for_match(c)) for c in CANONICAL_BUILDINGS] + + best = get_close_matches(query, canon_token_list, n=1) # Returns a list of the (top-1) closest match to the cleaned name - # Locate the table - table = soup.find("table", {"id": "directoryTable"}) - rows = table.find("tbody").find_all("tr") + # If no matches (empty list), return the original name and 0 + if not best: + return name, 0 - # Extract data + # Return the closest match and its similarity score + match = best[0] + + # Calculate the similarity score of the match to the original name (for internal use, potential debugging purposes) + index = canon_token_list.index(match) + canon_raw = CANONICAL_BUILDINGS[index] + score = int(SequenceMatcher(None, query, match).ratio() * 100) + + # If the score is below the threshold, return the original name instead of the canonical name + return (canon_raw, score) if score >= threshold else (name, score) + +def map_labels(description): + """ + Extract label tokens from the description. 
+ """ + if not description: + return [], description + + labels = LABEL_PHRASES_RE.findall(description) + labels = [label.title().replace("Aa&P", "AA&P").replace("Aap", "AA&P") for label in labels] + description = LABEL_PHRASES_RE.sub("", description).strip() + description = RESIDUAL_TRAILING_LABEL_RE.sub("", description).strip() + description = MULTI_SPACE_RE.sub(" ", description) + + return labels, description + +def fetch_printers_json(): + """ + Fetch printer data in JSON format from the CU Print directory endpoint. + """ + resp = requests.get(URL, headers=HEADERS, timeout=20) + resp.raise_for_status() + return resp.json() + +def scrape_printers(): + """ + Scrape CU Print printer locations from the Cornell directory page. + """ + payload = fetch_printers_json() data = [] - for row in rows: - cols = row.find_all("td") - if len(cols) < 3: # Ensure row has enough columns - continue - - location_name = cols[0].text.strip() - description = cols[1].text.strip() - - # Extract coordinates from the hyperlink tag inside - coordinates_link = cols[2].find("a") - coordinates_string = coordinates_link.text.strip() if coordinates_link else "" - coordinates = [float(x) for x in coordinates_string.split(', ')] + # payload['rows'] is a list of lists, where each inner list represents a row of data + for row in payload['rows']: + if len(row) < 3: # Ensure row has enough columns + continue # Skipping row with insufficient columns + + # Each row is of the structure ["Building", "Equipment & Location", "Coordinates (Lat, Lng)"] + [raw_building, raw_location, raw_coordinates] = row + + # Map raw building name to canonical building name + building, score = map_building(raw_building) + + # Map labels from description to canonical labels + # TODO: Handle description (parse for room number, etc.) 
+ description = raw_location + + # Splits coordinates string into a list of floats + coordinates = [float(x) for x in raw_coordinates.split(', ')] data.append({ - "Location": location_name, + "Location": building, "Description": description, "Coordinates": coordinates }) - return data \ No newline at end of file + + return data \ No newline at end of file From 772a893f085b70a7c86991fd0d7e0813461fa616 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 11 Oct 2025 23:21:54 -0400 Subject: [PATCH 02/15] Implement baseplate labeling for scraped data --- src/data/scrapers/printers.py | 63 +++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index df0161b..acddffe 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -89,6 +89,8 @@ COORD_SPLIT_RE = re.compile(r"\s*,\s*") ALL_CAPS_PHRASE_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b") TRAILING_CAPS_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\s*$") + +# Used for stripping common label phrases from building names LABEL_PHRASES_RE = re.compile( r""" \bresidents?\s*only\b | @@ -97,6 +99,25 @@ \baap\b """, re.IGNORECASE | re.VERBOSE ) + +# Used to identify common variants of labels +LABEL_PATTERNS = { + # Residents Only (singular/plural + optional hyphen + any case) + "Residents Only": re.compile(r"\bresident[s]?[-\s]*only\b", re.IGNORECASE), + + # AA&P Students Only (accept AA&P or AAP; allow any junk in-between; optional hyphen) + "AA&P Students Only": re.compile( + r"\b(?:aa\s*&\s*p|aap)\b.*\bstudent[s]?[-\s]*only\b", + re.IGNORECASE + ), + + # Landscape Architecture Students Only (allow arbitrary whitespace; optional hyphen) + "Landscape Architecture Students Only": re.compile( + r"\blandscape\s+architecture\b.*\bstudent[s]?[-\s]*only\b", + re.IGNORECASE + ), +} + RESIDUAL_TRAILING_LABEL_RE = re.compile( r"\b(?:resident|residents|student|students|staff|public)\b\s*$", 
re.IGNORECASE @@ -121,14 +142,6 @@ def _strip_trailing_allcaps(s): """ return TRAILING_CAPS_RE.sub("", s).strip() -# def _title_clean(s: str) -> str: -# """ -# Nice display casing: keep acronyms as-is, titlecase other words. -# """ -# words = s.split() -# fixed = [w if w.isupper() else w.title() for w in words] -# return " ".join(fixed) - def _pre_clean_for_match(s: str) -> str: s = _norm(s) s = LABEL_PHRASES_RE.sub(" ", s) # <— removes "Resident(s) only", "AA&P", etc. @@ -168,20 +181,25 @@ def map_building(name, threshold=87): # If the score is below the threshold, return the original name instead of the canonical name return (canon_raw, score) if score >= threshold else (name, score) -def map_labels(description): +def map_labels(text): """ Extract label tokens from the description. """ - if not description: - return [], description + if not text: + return [] + + cleaned = _norm(text) + found_labels = [] - labels = LABEL_PHRASES_RE.findall(description) - labels = [label.title().replace("Aa&P", "AA&P").replace("Aap", "AA&P") for label in labels] - description = LABEL_PHRASES_RE.sub("", description).strip() - description = RESIDUAL_TRAILING_LABEL_RE.sub("", description).strip() - description = MULTI_SPACE_RE.sub(" ", description) + for canon, pattern in LABEL_PATTERNS.items(): + # Search for the pattern in the cleaned text + if pattern.search(cleaned): + found_labels.append(canon) - return labels, description + # Remove the found label from the text to avoid duplicates + cleaned = pattern.sub("", cleaned).strip() + + return sorted(set(found_labels)) def fetch_printers_json(): """ @@ -210,6 +228,14 @@ def scrape_printers(): building, score = map_building(raw_building) # Map labels from description to canonical labels + labels = [] + + labels.extend(map_labels(raw_building)) # Get labels from the building name (e.g., "Residents Only") + labels.extend(map_labels(raw_location)) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") + + 
# Deduplicate and sort labels + labels = sorted(set(labels)) + # TODO: Handle description (parse for room number, etc.) description = raw_location @@ -219,7 +245,8 @@ def scrape_printers(): data.append({ "Location": building, "Description": description, - "Coordinates": coordinates + "Coordinates": coordinates, + "Labels": labels }) return data \ No newline at end of file From 4b008e458302e3cb74a646ed71c3b53617b25730 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 11 Oct 2025 23:28:45 -0400 Subject: [PATCH 03/15] Add labels for printer colors --- src/data/scrapers/printers.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index acddffe..5a568aa 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -74,12 +74,7 @@ "White Hall", "Willard Student Center" ] - -CANONICAL_LABELS = [ - "Residents Only", - "AA&P Students Only", - "Landscape Architecture Students Only" -] +# Add more buildings as needed... 
# Regex helpers HTML_TAG_RE = re.compile(r"<[^>]+>") @@ -102,6 +97,7 @@ # Used to identify common variants of labels LABEL_PATTERNS = { + # --- Access restrictions --- # Residents Only (singular/plural + optional hyphen + any case) "Residents Only": re.compile(r"\bresident[s]?[-\s]*only\b", re.IGNORECASE), @@ -116,6 +112,15 @@ r"\blandscape\s+architecture\b.*\bstudent[s]?[-\s]*only\b", re.IGNORECASE ), + + # --- Printer capabilities --- + "Color": re.compile(r"\bcolor\b", re.IGNORECASE), + "Black & White": re.compile( + r"\b(?:black\s*(?:and|&)\s*white|b\s*&\s*w)\b", re.IGNORECASE + ), + "Color, Scan, & Copy": re.compile( + r"\bcolor[,/ &]*(scan|copy|print|copying)+\b", re.IGNORECASE + ), } RESIDUAL_TRAILING_LABEL_RE = re.compile( @@ -198,7 +203,8 @@ def map_labels(text): # Remove the found label from the text to avoid duplicates cleaned = pattern.sub("", cleaned).strip() - + + return sorted(set(found_labels)) def fetch_printers_json(): @@ -234,7 +240,7 @@ def scrape_printers(): labels.extend(map_labels(raw_location)) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") # Deduplicate and sort labels - labels = sorted(set(labels)) + labels = sorted(set(labels)) # TODO: Handle description (parse for room number, etc.) description = raw_location From 7f87078d85266b2a62dbbb8ea329558002b6dd25 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 11 Oct 2025 23:38:29 -0400 Subject: [PATCH 04/15] Update description to exclude labels --- src/data/scrapers/printers.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index 5a568aa..8c76c21 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -191,7 +191,7 @@ def map_labels(text): Extract label tokens from the description. 
""" if not text: - return [] + return text, [] cleaned = _norm(text) found_labels = [] @@ -204,8 +204,8 @@ def map_labels(text): # Remove the found label from the text to avoid duplicates cleaned = pattern.sub("", cleaned).strip() - - return sorted(set(found_labels)) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned, sorted(set(found_labels)) def fetch_printers_json(): """ @@ -236,14 +236,16 @@ def scrape_printers(): # Map labels from description to canonical labels labels = [] - labels.extend(map_labels(raw_building)) # Get labels from the building name (e.g., "Residents Only") - labels.extend(map_labels(raw_location)) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") + _, building_labels = map_labels(raw_building) # Get labels from the building name (e.g., "Residents Only") + remainder, location_labels = map_labels(raw_location) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") # Deduplicate and sort labels + labels += building_labels + labels += location_labels labels = sorted(set(labels)) - - # TODO: Handle description (parse for room number, etc.) - description = raw_location + + cleaned = re.sub(r"^[\s\-–—:/|]+", "", remainder).strip() # Remove leftover delimiters at the start (like " - ", " / ", ": ", etc.) 
+ description = cleaned # Final cleaned description text (with labels removed) — essentially, remainder of the location description # Splits coordinates string into a list of floats coordinates = [float(x) for x in raw_coordinates.split(', ')] From ff5d61be6d0d815a736547582ac545cd72213b93 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 11 Oct 2025 23:42:50 -0400 Subject: [PATCH 05/15] Add comments/documentation and clean up code --- src/data/scrapers/printers.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index 8c76c21..a71e1f7 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -10,6 +10,7 @@ URL = 'https://www.cornell.edu/about/maps/directory/text-data.cfm?layer=CUPrint&caption=%20CU%20Print%20Printers' +# HTTP headers to mimic a real browser request HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36", "Referer": 'https://www.cornell.edu/about/maps/directory/', @@ -17,7 +18,8 @@ "Accept": 'application/json, text/javascript, */*', } -# Canonical list of Cornell buildings; NOTE: This list is not exhaustive. +# Canonical list of Cornell buildings +# NOTE: This list is not exhaustive. Add more buildings as needed... CANONICAL_BUILDINGS = [ "Akwe:kon", "Alice Cook House", @@ -74,15 +76,11 @@ "White Hall", "Willard Student Center" ] -# Add more buildings as needed... 
# Regex helpers HTML_TAG_RE = re.compile(r"<[^>]+>") BRACKET_CONTENT_RE = re.compile(r"[\(\[\{].*?[\)\]\}]") MULTI_SPACE_RE = re.compile(r"\s+") -DELIMS_RE = re.compile(r"\s*[-–—:/|]\s*") -COORD_SPLIT_RE = re.compile(r"\s*,\s*") -ALL_CAPS_PHRASE_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b") TRAILING_CAPS_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\s*$") # Used for stripping common label phrases from building names @@ -123,6 +121,7 @@ ), } +# Used for stripping residual trailing labels from descriptions RESIDUAL_TRAILING_LABEL_RE = re.compile( r"\b(?:resident|residents|student|students|staff|public)\b\s*$", re.IGNORECASE @@ -148,6 +147,9 @@ def _strip_trailing_allcaps(s): return TRAILING_CAPS_RE.sub("", s).strip() def _pre_clean_for_match(s: str) -> str: + """ + Pre-clean a building name for matching against the canonical list. + """ s = _norm(s) s = LABEL_PHRASES_RE.sub(" ", s) # <— removes "Resident(s) only", "AA&P", etc. s = _strip_trailing_allcaps(s) @@ -158,18 +160,25 @@ def _pre_clean_for_match(s: str) -> str: return s def _token_sort(s): + """ + Tokenize a string, sort the tokens, and re-join them. + """ tokens = s.lower().split() tokens.sort() return " ".join(tokens) def map_building(name, threshold=87): + """ + Map a building name to a canonical building name using fuzzy matching. 
+ """ if not name: return None, 0 query = _token_sort(_pre_clean_for_match(name)) canon_token_list = [_token_sort(_pre_clean_for_match(c)) for c in CANONICAL_BUILDINGS] - best = get_close_matches(query, canon_token_list, n=1) # Returns a list of the (top-1) closest match to the cleaned name + # Returns a list of the (top-1) closest match to the cleaned name + best = get_close_matches(query, canon_token_list, n=1) # If no matches (empty list), return the original name and 0 if not best: @@ -231,7 +240,7 @@ def scrape_printers(): [raw_building, raw_location, raw_coordinates] = row # Map raw building name to canonical building name - building, score = map_building(raw_building) + building, _ = map_building(raw_building) # Map labels from description to canonical labels labels = [] From 906c6331f54a5726b01a037b5a7d7b150a8b1e3a Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sun, 12 Oct 2025 00:30:16 -0400 Subject: [PATCH 06/15] Include labels in database creation and population --- src/data/db/database.py | 40 ++++++++++++++++++++++++++++++++- src/data/db/models.py | 24 ++++++++++++++++++++ src/data/scripts/populate_db.py | 2 +- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/data/db/database.py b/src/data/db/database.py index efdf541..7aba210 100644 --- a/src/data/db/database.py +++ b/src/data/db/database.py @@ -32,7 +32,7 @@ def insert_library(location, address, latitude, longitude): conn.close() -def insert_printer(location, description, latitude, longitude): +def insert_printer(location, description, labels, latitude, longitude): """Insert a printer into the database.""" conn = get_db_connection() cursor = conn.cursor() @@ -44,6 +44,44 @@ def insert_printer(location, description, latitude, longitude): """, (location, description, latitude, longitude), ) + + # Insert labels into the labels table and get their IDs + label_ids = [] + for label in labels: + cursor.execute( + """ + INSERT OR IGNORE INTO labels (label) + VALUES (?) 
+ """, + (label,), + ) + cursor.execute( + """ + SELECT id FROM labels WHERE label = ? + """, + (label,), + ) + label_id = cursor.fetchone()[0] + label_ids.append(label_id) + + # Create entries in the junction table for printer-label relationships + cursor.execute( + """ + SELECT id FROM printers WHERE location = ? AND description = ? AND latitude = ? AND longitude = ? + """, + (location, description, latitude, longitude), + ) + printer_id = cursor.fetchone()[0] + + # Insert into junction table + for label_id in label_ids: + cursor.execute( + """ + INSERT OR IGNORE INTO printer_labels (printer_id, label_id) + VALUES (?, ?) + """, + (printer_id, label_id), + ) conn.commit() conn.close() diff --git a/src/data/db/models.py b/src/data/db/models.py index 7634fd0..5499c30 100644 --- a/src/data/db/models.py +++ b/src/data/db/models.py @@ -15,6 +15,7 @@ def create_tables(): conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() + #TODO: Remove UNIQUE constraint from location cursor.execute( """ CREATE TABLE IF NOT EXISTS libraries ( @@ -50,6 +51,29 @@ def create_tables(): ) """ ) + + # Table for storing unique labels + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS labels ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + label TEXT UNIQUE NOT NULL + ) + """ + ) + + # Junction table for many-to-many relationship between printers and labels + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS printer_labels ( + printer_id INTEGER NOT NULL, + label_id INTEGER NOT NULL, + PRIMARY KEY (printer_id, label_id), + FOREIGN KEY (printer_id) REFERENCES printers(id) ON DELETE CASCADE, + FOREIGN KEY (label_id) REFERENCES labels(id) ON DELETE CASCADE + ) + """ + ) conn.commit() conn.close() diff --git a/src/data/scripts/populate_db.py b/src/data/scripts/populate_db.py index fa6a23f..c84cd1b 100644 --- a/src/data/scripts/populate_db.py +++ b/src/data/scripts/populate_db.py @@ -18,7 +18,7 @@ def populate_db(): # Insert printers printers = scrape_printers() for printer in printers: - 
insert_printer(printer['Location'], printer['Description'], printer['Coordinates'][0], printer['Coordinates'][1]) + insert_printer(printer['Location'], printer['Description'], printers['Labels'], printer['Coordinates'][0], printer['Coordinates'][1]) if __name__ == "__main__": populate_db() \ No newline at end of file From de836bce1011263ea3b3082ba7334da688a518ac Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Tue, 14 Oct 2025 03:28:20 -0400 Subject: [PATCH 07/15] Update endpoint for fetching printer information and corresponding swagger documentation --- src/swagger.json | 2 +- src/utils/EcosystemUtils.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/swagger.json b/src/swagger.json index ff9b0af..fc7f734 100644 --- a/src/swagger.json +++ b/src/swagger.json @@ -66,7 +66,7 @@ ], "responses": { "200": { - "description": "{\"success\": true, \"data\": [{\"id\": 1, \"location\": \"Akwe:kon\", \"description\": \"Color - Room 115\", \"latitude\": 42.4563, \"longitude\": -76.4806}]}", + "description": "{\"success\": true, \"data\": [{\"id\": 1, \"location\": \"Akwe:kon\", \"description\": \"Room 115\", \"latitude\": 42.4563, \"longitude\": -76.4806, \"labels\": [\"Color\"]}]}", "schema": { "$ref": "#/components/schemas/BusStop" } diff --git a/src/utils/EcosystemUtils.js b/src/utils/EcosystemUtils.js index 5aadd2b..a5e979a 100644 --- a/src/utils/EcosystemUtils.js +++ b/src/utils/EcosystemUtils.js @@ -45,7 +45,7 @@ function fetchAllPrinters() { }); // Fetch printers - db.all("SELECT * FROM printers", (err, rows) => { + db.all("SELECT p.id, p.location, p.description, p.latitude, p.longitude, COALESCE(GROUP_CONCAT(DISTINCT l.label, ', '), '') AS labels FROM printers p LEFT JOIN printer_labels pl ON p.id = pl.printer_id LEFT JOIN labels l ON pl.label_id = l.id GROUP BY p.id", (err, rows) => { if (err) { console.error(err.message); return reject(err); From f23bcc5726b88691265d533aabe89d6af0950b42 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: 
Fri, 7 Nov 2025 19:30:35 -0500 Subject: [PATCH 08/15] Add script to run migrations on database --- package-lock.json | 15 +++++ package.json | 1 + src/.DS_Store | Bin 10244 -> 8196 bytes src/data/scripts/run-migrations.js | 100 +++++++++++++++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 src/data/scripts/run-migrations.js diff --git a/package-lock.json b/package-lock.json index 826744f..f6b0842 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "license": "ISC", "dependencies": { + "better-sqlite3": "^12.4.1", "dotenv": "^16.4.7", "express": "^4.21.2", "firebase-admin": "^13.1.0", @@ -791,6 +792,20 @@ "tweetnacl": "^0.14.3" } }, + "node_modules/better-sqlite3": { + "version": "12.4.1", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.4.1.tgz", + "integrity": "sha512-3yVdyZhklTiNrtg+4WqHpJpFDd+WHTg2oM7UcR80GqL05AOV0xEJzc6qNvFYoEtE+hRp1n9MpN6/+4yhlGkDXQ==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "bindings": "^1.5.0", + "prebuild-install": "^7.1.1" + }, + "engines": { + "node": "20.x || 22.x || 23.x || 24.x" + } + }, "node_modules/bignumber.js": { "version": "9.1.2", "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.1.2.tgz", diff --git a/package.json b/package.json index 057904a..74e05ef 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "author": "", "license": "ISC", "dependencies": { + "better-sqlite3": "^12.4.1", "dotenv": "^16.4.7", "express": "^4.21.2", "firebase-admin": "^13.1.0", diff --git a/src/.DS_Store b/src/.DS_Store index a39c96c47949260ec6c8cd9be8d9394560f28cf6..807e94451ef138fbec80fa0ae23bfb042a5ce57f 100644 GIT binary patch delta 96 zcmZn(XmOBWU|?W$DortDU;r^WfEYvza8E20o2aMA$gweCH$NlCWFCRdo6ie+unRH+ kWr09~8%Vf<6mBg1&ODi4C6I#=qMc!KJkPYvd}3_O03^Z?xc~qF delta 201 zcmZp1XbF&DU|?W$DortDU{C-uIe-{M3-C-V6q~50$jG%ZU^hP_*JK`n&9Zz9DGd1x z$qd;HsSHI7xM|c{%xc=|H;} 
zH}4VfX5ZNGflZJZ$OQrgZXn?ba>d5N@640=WdcQ*Aa2tDNi#Aq7=Y->1v0&xKMFH5 F0|3-|C*lAA diff --git a/src/data/scripts/run-migrations.js b/src/data/scripts/run-migrations.js new file mode 100644 index 0000000..25e2b1d --- /dev/null +++ b/src/data/scripts/run-migrations.js @@ -0,0 +1,100 @@ +// Imports necessary for data migrations +const fs = require('fs'); // Node's built-in file system module, which lets us read from disk +const path = require('path'); // Safer way to express file paths/path joining +const crypto = require('crypto'); +const Database = require('better-sqlite3'); + +const DB_PATH = path.join(__dirname, "../transit.db"); // Finds db file from current file's directory +const MIGRATIONS_DIR = path.join(__dirname, "../migrations"); + +/** + * Hashes a string using SHA-256 + * + * We use this to store the checksum of the migration file in the database. + * This allows us to track which migrations have been applied, as well as if a migration file has been modified since it was last applied. + * + * @param {string} s - The string to hash + * @returns {string} - The SHA-256 hash of the string + */ +function sha256(s) { + return crypto.createHash('sha256').update(s, 'utf8').digest('hex'); +} + +/** + * Runs the migrations + * + * This function reads all the migration files in the migrations directory, hashes them, and stores the checksum in the database. + * It then executes the migrations in the order of the files. 
+ * + * @returns {void} + * @throws {Error} - If the migrations fail + */ +function runMigration() { + // Open the database using the better-sqlite3 library + const db = new Database(DB_PATH); + + // Set defaults for migrations + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + db.pragma('foreign_keys = ON'); + + // Create the schema_migrations table if it doesn't exist for tracking migrations applied to the database + db.exec(` + CREATE TABLE IF NOT EXISTS schema_migrations ( + id INTEGER PRIMARY KEY, + filename TEXT NOT NULL UNIQUE, + checksum TEXT NOT NULL, + applied_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + `); + + // Get the list of migrations that have already been applied to the database + const applied = new Set( + db.prepare('SELECT filename FROM schema_migrations').all().map(record => record.filename) + ); + + // Get the list of migration files in the migrations directory (keeping only .sql files and sorting them chronologically) + const files = fs.readdirSync(MIGRATIONS_DIR).filter(f => f.endsWith('.sql')).sort(); + + // Prepare the statement to insert a new migration into the schema_migrations table + const insertMig = db.prepare(` + INSERT INTO schema_migrations (filename, checksum) VALUES (?,?) 
+ `); + + // Define a transaction to execute the migrations + const transaction = db.transaction(() => { + for (const file of files) { + // Skip if the migration has already been applied + if (applied.has(file)) { + continue; + } + + const full = path.join(MIGRATIONS_DIR, file); + const sql = fs.readFileSync(full, 'utf8').trim(); + if (!sql) { + continue; + } + + // Defensive: re-enable FKs inside each run (is already done in the migrations, but just in case) + db.exec('PRAGMA foreign_keys = ON;'); + + // Execute SQL commands in the migration file + db.exec(sql); + + // Records migration as applied to the database via its check + insertMig.run(file, sha256(sql)); + console.log(`Applied ${file}`); + } + }); + + try { + transaction(); + console.log('All migrations applied'); + } catch (e) { + console.error("Migration failed", e); + } finally { + db.close(); + } +} + +runMigration(); \ No newline at end of file From f8d95d5b4aac621931641cf5e4842da8bc22583a Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Fri, 7 Nov 2025 19:31:07 -0500 Subject: [PATCH 09/15] Add migration files to create labels and printer_label tables --- src/data/migrations/2025117_1854_create_labels.sql | 6 ++++++ .../migrations/2025117_1859_create_printer_labels.sql | 9 +++++++++ 2 files changed, 15 insertions(+) create mode 100644 src/data/migrations/2025117_1854_create_labels.sql create mode 100644 src/data/migrations/2025117_1859_create_printer_labels.sql diff --git a/src/data/migrations/2025117_1854_create_labels.sql b/src/data/migrations/2025117_1854_create_labels.sql new file mode 100644 index 0000000..3884e98 --- /dev/null +++ b/src/data/migrations/2025117_1854_create_labels.sql @@ -0,0 +1,6 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS labels ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + label TEXT UNIQUE NOT NULL +); \ No newline at end of file diff --git a/src/data/migrations/2025117_1859_create_printer_labels.sql b/src/data/migrations/2025117_1859_create_printer_labels.sql 
new file mode 100644 index 0000000..73fd9c0 --- /dev/null +++ b/src/data/migrations/2025117_1859_create_printer_labels.sql @@ -0,0 +1,9 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS printer_labels ( + printer_id INTEGER NOT NULL, + label_id INTEGER NOT NULL, + PRIMARY KEY (printer_id, label_id), + FOREIGN KEY (printer_id) REFERENCES printers(id) ON DELETE CASCADE, + FOREIGN KEY (label_id) REFERENCES labels(id) ON DELETE CASCADE +); \ No newline at end of file From 9fe37ff164f96a6c702f97c139d0445eb9b59656 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Fri, 7 Nov 2025 19:31:52 -0500 Subject: [PATCH 10/15] Remove labels and printer_labels table from database initialization for migration --- src/data/db/models.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/data/db/models.py b/src/data/db/models.py index 5499c30..17db360 100644 --- a/src/data/db/models.py +++ b/src/data/db/models.py @@ -51,29 +51,6 @@ def create_tables(): ) """ ) - - # Table for storing unique labels - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS labels ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - label TEXT UNIQUE NOT NULL - ) - """ - ) - - # Junction table for many-to-many relationship between printers and labels - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS printer_labels ( - printer_id INTEGER NOT NULL, - label_id INTEGER NOT NULL, - PRIMARY KEY (printer_id, label_id), - FOREIGN KEY (printer_id) REFERENCES printers(id) ON DELETE CASCADE, - FOREIGN KEY (label_id) REFERENCES labels(id) ON DELETE CASCADE - ) - """ - ) conn.commit() conn.close() From 919125c48d5c4d3d4b6fdedcd960aa0752f4403e Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Fri, 7 Nov 2025 20:15:22 -0500 Subject: [PATCH 11/15] Minor bug fix --- src/data/scripts/populate_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/scripts/populate_db.py b/src/data/scripts/populate_db.py index c84cd1b..30ddc62 100644 --- a/src/data/scripts/populate_db.py 
+++ b/src/data/scripts/populate_db.py @@ -18,7 +18,7 @@ def populate_db(): # Insert printers printers = scrape_printers() for printer in printers: - insert_printer(printer['Location'], printer['Description'], printers['Labels'], printer['Coordinates'][0], printer['Coordinates'][1]) + insert_printer(printer['Location'], printer['Description'], printer['Labels'], printer['Coordinates'][0], printer['Coordinates'][1]) if __name__ == "__main__": populate_db() \ No newline at end of file From aa664b75f5a1ac4cc1560456d096fcd9c1ab7b16 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Fri, 7 Nov 2025 20:29:09 -0500 Subject: [PATCH 12/15] Export script to run migrations (and populate db) --- package.json | 4 +++- src/data/scripts/run-migrations.js | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 74e05ef..b4d1f5c 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,9 @@ "type": "module", "scripts": { "start:dev": "nodemon --ignore src/data/notifRequests.json src/index.js", - "start": "node src/index.js" + "start": "node src/index.js", + "migrate": "node src/data/scripts/run-migrations.js", + "populate:db": "npm run migrate && python3 src/data/scripts/populate_db.py" }, "keywords": [], "author": "", diff --git a/src/data/scripts/run-migrations.js b/src/data/scripts/run-migrations.js index 25e2b1d..4748193 100644 --- a/src/data/scripts/run-migrations.js +++ b/src/data/scripts/run-migrations.js @@ -97,4 +97,10 @@ function runMigration() { } } -runMigration(); \ No newline at end of file +module.exports = { + runMigration +}; + +if (require.main === module) { + runMigration(); +} \ No newline at end of file From 58917a8995e6d27341c1a0209e0719e299532685 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 8 Nov 2025 10:07:46 -0500 Subject: [PATCH 13/15] Fix scraping and database bugs --- src/data/db/database.py | 22 +++++++++---------- src/data/db/models.py | 3 +-- src/data/scrapers/printers.py | 41 
+++++++++++++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/src/data/db/database.py b/src/data/db/database.py index 7aba210..9b857b2 100644 --- a/src/data/db/database.py +++ b/src/data/db/database.py @@ -37,14 +37,18 @@ def insert_printer(location, description, labels, latitude, longitude): conn = get_db_connection() cursor = conn.cursor() + # We remove the "OR IGNORE" because we acknowledge that several printers may have the same location and description (i.e., same building and room), so we rely on the unique printer_id to identify the printer cursor.execute( """ - INSERT OR IGNORE INTO printers (location, description, latitude, longitude) + INSERT INTO printers (location, description, latitude, longitude) VALUES (?, ?, ?, ?) """, (location, description, latitude, longitude), ) - + + # To get the printer_id, we do NOT rely on the location/description/coordinates, but rather on the printer_id that was just inserted (lastrowid), as several printers may have the same location and description (i.e., same building and room) + printer_id = cursor.lastrowid + # Insert labels into the labels table and get their IDs label_ids = [] for label in labels: @@ -61,17 +65,11 @@ def insert_printer(location, description, labels, latitude, longitude): """, (label,), ) - label_id = cursor.fetchone()[0] + result = cursor.fetchone() + if result is None: + raise ValueError(f"Failed to find label: {label}") + label_id = result[0] label_ids.append(label_id) - - # Create entries in the junction table for printer-label relationships - cursor.execute( - """ - SELECT id FROM printers WHERE location = ? AND description = ? AND latitude = ? AND longitude = ? 
- """, - (location, description, latitude, longitude), - ) - printer_id = cursor.fetchone()[0] # Insert into junction table for label_id in label_ids: diff --git a/src/data/db/models.py b/src/data/db/models.py index 17db360..8183be9 100644 --- a/src/data/db/models.py +++ b/src/data/db/models.py @@ -15,7 +15,6 @@ def create_tables(): conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() - #TODO: Remove UNIQUE constraint from location cursor.execute( """ CREATE TABLE IF NOT EXISTS libraries ( @@ -32,7 +31,7 @@ def create_tables(): """ CREATE TABLE IF NOT EXISTS printers ( id INTEGER PRIMARY KEY AUTOINCREMENT, - location TEXT UNIQUE, + location TEXT, description TEXT, latitude REAL, longitude REAL diff --git a/src/data/scrapers/printers.py b/src/data/scrapers/printers.py index a71e1f7..ea40cd6 100644 --- a/src/data/scrapers/printers.py +++ b/src/data/scrapers/printers.py @@ -1,5 +1,4 @@ import requests -from bs4 import BeautifulSoup from difflib import get_close_matches # For data scraping from difflib import SequenceMatcher import re # For using regex @@ -112,13 +111,14 @@ ), # --- Printer capabilities --- - "Color": re.compile(r"\bcolor\b", re.IGNORECASE), + "Color, Scan, & Copy": re.compile( + r"\bcolor\s*[,/&]?\s*(?:scan\s*[,/&]?\s*)?(?:and\s*)?\s*&?\s*(?:copy|print|copying)\b", re.IGNORECASE + ), "Black & White": re.compile( r"\b(?:black\s*(?:and|&)\s*white|b\s*&\s*w)\b", re.IGNORECASE ), - "Color, Scan, & Copy": re.compile( - r"\bcolor[,/ &]*(scan|copy|print|copying)+\b", re.IGNORECASE - ), + "Color": re.compile(r"\bcolor\b", re.IGNORECASE), + } # Used for stripping residual trailing labels from descriptions @@ -209,11 +209,20 @@ def map_labels(text): # Search for the pattern in the cleaned text if pattern.search(cleaned): found_labels.append(canon) + cleaned = pattern.sub("", cleaned, count=1).strip() + + # Collapse runs of punctuation-delimiters to a single space + cleaned = re.sub(r"\s*[,;/|&\-–—:]+\s*", " ", cleaned) - # Remove the found label from the text 
to avoid duplicates - cleaned = pattern.sub("", cleaned).strip() + # Remove any leftover leading delimiters/spaces (e.g., ", ", "- ") + cleaned = re.sub(r"^[\s,;/|&\-–—:]+", "", cleaned) + # Remove standalone "Copy", "Print", or "Scan" at the start (leftover from partial label removal) + cleaned = re.sub(r"^(?:copy|print|scan)\s+", "", cleaned, flags=re.IGNORECASE) + + # Final whitespace cleanup cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned, sorted(set(found_labels)) def fetch_printers_json(): @@ -242,12 +251,17 @@ def scrape_printers(): # Map raw building name to canonical building name building, _ = map_building(raw_building) + # If we weren't able to map the building to a canonical building, skip this row + # NOTE: This should prevent us from getting "None" as the location, which was happening earlier + if building not in CANONICAL_BUILDINGS: + continue + # Map labels from description to canonical labels labels = [] _, building_labels = map_labels(raw_building) # Get labels from the building name (e.g., "Residents Only") remainder, location_labels = map_labels(raw_location) # Get labels from the location description (e.g., "Landscape Architecture Student ONLY") - + # Deduplicate and sort labels labels += building_labels labels += location_labels @@ -266,4 +280,13 @@ def scrape_printers(): "Labels": labels }) - return data \ No newline at end of file + return data + +if __name__ == "__main__": + results = scrape_printers() + print(f"Scraped {len(results)} printers.\n") + + # Print a sample of the data + for row in results: + if row['Location'] == 'Vet Library': + print(row['Description'], row['Labels']) \ No newline at end of file From 8f8486c9c3e4b510a945609d1368244f63d7fe4e Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 8 Nov 2025 10:12:17 -0500 Subject: [PATCH 14/15] Fix imports --- src/data/scripts/run-migrations.js | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git 
a/src/data/scripts/run-migrations.js b/src/data/scripts/run-migrations.js index 4748193..5e499fc 100644 --- a/src/data/scripts/run-migrations.js +++ b/src/data/scripts/run-migrations.js @@ -1,10 +1,15 @@ // Imports necessary for data migrations -const fs = require('fs'); // Node's built-in file system module, which lets us read from disk -const path = require('path'); // Safer way to express file paths/path joining -const crypto = require('crypto'); -const Database = require('better-sqlite3'); +import fs from 'fs' // Node's built-in file system module, which lets us read from disk +import path from 'path'; // Safer way to express file paths/path joining +import crypto from 'crypto'; +import Database from 'better-sqlite3'; +import { fileURLToPath } from 'url'; -const DB_PATH = path.join(__dirname, "../transit.db"); // Finds db file from current file's directory +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +// || path.join(__dirname, "../transit.db") +const DB_PATH = process.env.DB_PATH; // Finds db file from current file's directory const MIGRATIONS_DIR = path.join(__dirname, "../migrations"); /** @@ -97,10 +102,11 @@ function runMigration() { } } -module.exports = { - runMigration -}; - -if (require.main === module) { +export function runMigrations() { runMigration(); -} \ No newline at end of file +} + +import { pathToFileURL } from 'url'; +if (import.meta.url === pathToFileURL(process.argv[1]).href) { + runMigrations(); + } \ No newline at end of file From 7e062ccee33396f7cd89ab36862438af7bafb692 Mon Sep 17 00:00:00 2001 From: Chimdi Ejiogu Date: Sat, 8 Nov 2025 10:12:34 -0500 Subject: [PATCH 15/15] Add pycache --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4163587..a8fbbdc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ build/ logs/ node_modules/ +__pycache__/ # Specific Files config.json