From d61e5ca8bd71bca00c72cea43de44570973ab404 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 21 Nov 2025 21:13:42 +0000 Subject: [PATCH] Add Echo Rule watermark analysis for LLM learning article - Add manual_analysis.py script for watermark detection when spaCy model unavailable - Include sample article text (LLM learning research) for analysis - Generate detailed JSON analysis output with 46 clause pairs analyzed - Result: LIKELY_HUMAN verdict with 0.209 final score (below 0.45 threshold) --- data/analysis_input.txt | 27 + data/analysis_output.json | 4913 ++++++++++++++++++++++++++++++++++++ scripts/manual_analysis.py | 654 +++++ 3 files changed, 5594 insertions(+) create mode 100644 data/analysis_input.txt create mode 100644 data/analysis_output.json create mode 100644 scripts/manual_analysis.py diff --git a/data/analysis_input.txt b/data/analysis_input.txt new file mode 100644 index 0000000..b48a9d3 --- /dev/null +++ b/data/analysis_input.txt @@ -0,0 +1,27 @@ +Since the release of ChatGPT in late 2022, millions of people have started using large language models to access knowledge. And it's easy to understand their appeal: Ask a question, get a polished synthesis and move on – it feels like effortless learning. + +However, a new paper I co-authored offers experimental evidence that this ease may come at a cost: When people rely on large language models to summarize information on a topic for them, they tend to develop shallower knowledge about it compared to learning through a standard Google search. + +Co-author Jin Ho Yun and I, both professors of marketing, reported this finding in a paper based on seven studies with more than 10,000 participants. Most of the studies used the same basic paradigm: Participants were asked to learn about a topic – such as how to grow a vegetable garden – and were randomly assigned to do so by using either an LLM like ChatGPT or the "old-fashioned way," by navigating links using a standard Google search. + +No restrictions were put on how they used the tools; they could search on Google as long as they wanted and could continue to prompt ChatGPT if they felt they wanted more information. Once they completed their research, they were then asked to write advice to a friend on the topic based on what they learned. + +The data revealed a consistent pattern: People who learned about a topic through an LLM versus web search felt that they learned less, invested less effort in subsequently writing their advice, and ultimately wrote advice that was shorter, less factual and more generic. In turn, when this advice was presented to an independent sample of readers, who were unaware of which tool had been used to learn about the topic, they found the advice to be less informative, less helpful, and they were less likely to adopt it. + +We found these differences to be robust across a variety of contexts. For example, one possible reason LLM users wrote briefer and more generic advice is simply that the LLM results exposed users to less eclectic information than the Google results. To control for this possibility, we conducted an experiment where participants were exposed to an identical set of facts in the results of their Google and ChatGPT searches. Likewise, in another experiment we held constant the search platform – Google – and varied whether participants learned from standard Google results or Google's AI Overview feature. + +The findings confirmed that, even when holding the facts and platform constant, learning from synthesized LLM responses led to shallower knowledge compared to gathering, interpreting and synthesizing information for oneself via standard web links. + +Why did the use of LLMs appear to diminish learning? One of the most fundamental principles of skill development is that people learn best when they are actively engaged with the material they are trying to learn. + +When we learn about a topic through Google search, we face much more "friction": We must navigate different web links, read informational sources, and interpret and synthesize them ourselves. + +While more challenging, this friction leads to the development of a deeper, more original mental representation of the topic at hand. But with LLMs, this entire process is done on the user's behalf, transforming learning from a more active to passive process. + +To be clear, we do not believe the solution to these issues is to avoid using LLMs, especially given the undeniable benefits they offer in many contexts. Rather, our message is that people simply need to become smarter or more strategic users of LLMs – which starts by understanding the domains wherein LLMs are beneficial versus harmful to their goals. + +Need a quick, factual answer to a question? Feel free to use your favorite AI co-pilot. But if your aim is to develop deep and generalizable knowledge in an area, relying on LLM syntheses alone will be less helpful. + +As part of my research on the psychology of new technology and new media, I am also interested in whether it's possible to make LLM learning a more active process. In another experiment we tested this by having participants engage with a specialized GPT model that offered real-time web links alongside its synthesized responses. There, however, we found that once participants received an LLM summary, they weren't motivated to dig deeper into the original sources. The result was that the participants still developed shallower knowledge compared to those who used standard Google. + +Building on this, in my future research I plan to study generative AI tools that impose healthy frictions for learning tasks – specifically, examining which types of guardrails or speed bumps most successfully motivate users to actively learn more beyond easy, synthesized answers. Such tools would seem particularly critical in secondary education, where a major challenge for educators is how best to equip students to develop foundational reading, writing and math skills while also preparing for a real world where LLMs are likely to be an integral part of their daily lives. \ No newline at end of file diff --git a/data/analysis_output.json b/data/analysis_output.json new file mode 100644 index 0000000..f2b1313 --- /dev/null +++ b/data/analysis_output.json @@ -0,0 +1,4913 @@ +{ + "text_length": 5084, + "word_count": 835, + "sentence_count": 30, + "clause_pairs_found": 46, + "average_phonetic": 0.2223853235702717, + "average_structural": 0.2608695652173913, + "average_semantic": 0.1391304347826087, + "final_score": 0.20895412942810873, + "verdict": "LIKELY_HUMAN", + "confidence": 0.3910458705718912, + "reasoning": [ + "Very low echo score (0.209) - consistent with human writing" + ], + "echo_scores": [ + { + "phonetic": 0.20565464786937449, + "structural": 0.3333333333333333, + "semantic": 0.0, + "combined": 0.18226185914774978, + "details": { + "clause_a": "And it's easy to understand their appeal: Ask a qu...", + "clause_b": "it feels like effortless learning.", + "separator": "\u2013", + "zone_a": [ + "polished", + "synthesis", + "move" + ], + "zone_b": [ + "feels", + "like", + "effortless" + ], + "phonetic": { + "avg_similarity": 0.3641366196734362, + "initial_echo": 0.0, + "final_echo": 0.2, + "best_matches": [ + [ + "synthesis", + "effortless", + "S IH1 N TH AH0 S AH0 S", + "EH1 F ER0 T L AH0 S", + 0.5853658536585367 + ] + ], + "zone_a_phonetics": [ + [ + "polished", + "P AA1 L IH0 SH T" + ], + [ + "synthesis", + "S IH1 N TH AH0 S AH0 S" + ], + [ + "move", + "M UW1 V" + ] + ], + "zone_b_phonetics": [ + [ + "feels", + "F IY1 L Z" + ], + [ + "like", + "L AY1 K" + ], + [ + "effortless", + "EH1 F ER0 T L AH0 S" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "ADJ" + ], + "bigrams_a": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADJ" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.3333333333333333 + }, + "semantic": { + "zone_a_categories": [ + "communication" + ], + "zone_b_categories": [ + "cognitive" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.13118355155454495, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.20247342062181797, + "details": { + "clause_a": "And it's easy to understand their appeal", + "clause_b": "Ask a question, get a polished synthesis and move ...", + "separator": ":", + "zone_a": [ + "easy", + "understand", + "appeal" + ], + "zone_b": [ + "ask", + "question", + "get" + ], + "phonetic": { + "avg_similarity": 0.3279588788863623, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "easy", + "IY1 Z IY0" + ], + [ + "understand", + "AH2 N D ER0 S T AE1 N D" + ], + [ + "appeal", + "AH0 P IY1 L" + ] + ], + "zone_b_phonetics": [ + [ + "ask", + "AE1 S K" + ], + [ + "question", + "K W EH1 S CH AH0 N" + ], + [ + "get", + "G EH1 T" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "ADJ" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1428395061728395, + "structural": 0.5, + "semantic": 0.2, + "combined": 0.2671358024691358, + "details": { + "clause_a": "And it's easy to underst", + "clause_b": "their appeal: Ask a question, get a polished synth...", + "separator": "and", + "zone_a": [ + "easy", + "underst" + ], + "zone_b": [ + "appeal", + "ask", + "question" + ], + "phonetic": { + "avg_similarity": 0.35709876543209873, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "easy", + "IY1 Z IY0" + ] + ], + "zone_b_phonetics": [ + [ + "appeal", + "AH0 P IY1 L" + ], + [ + "ask", + "AE1 S K" + ], + [ + "question", + "K W EH1 S CH AH0 N" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN" + ], + "pos_b": [ + "ADJ", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADJ", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.13232054479672606, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.05292821791869043, + "details": { + "clause_a": "And it's easy to understand their appeal: Ask a qu...", + "clause_b": "move on \u2013 it feels like effortless learning.", + "separator": "and", + "zone_a": [ + "get", + "polished", + "synthesis" + ], + "zone_b": [ + "move", + "feels", + "like" + ], + "phonetic": { + "avg_similarity": 0.33080136199181515, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "get", + "G EH1 T" + ], + [ + "polished", + "P AA1 L IH0 SH T" + ], + [ + "synthesis", + "S IH1 N TH AH0 S AH0 S" + ] + ], + "zone_b_phonetics": [ + [ + "move", + "M UW1 V" + ], + [ + "feels", + "F IY1 L Z" + ], + [ + "like", + "L AY1 K" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "VERB", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "communication" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1593443551441488, + "structural": 0.0, + "semantic": 0.2, + "combined": 0.12373774205765951, + "details": { + "clause_a": "However, a new paper I co-authored offers experime...", + "clause_b": "When people rely on large language models to summa...", + "separator": ":", + "zone_a": [ + "ease", + "come", + "cost" + ], + "zone_b": [ + "people", + "rely", + "large" + ], + "phonetic": { + "avg_similarity": 0.39836088786037194, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "cost", + "large", + "K AA1 S T", + "L AA1 R JH", + 0.631578947368421 + ] + ], + "zone_a_phonetics": [ + [ + "ease", + "IY1 Z" + ], + [ + "come", + "K AH1 M" + ], + [ + "cost", + "K AA1 S T" + ] + ], + "zone_b_phonetics": [ + [ + "people", + "P IY1 P AH0 L" + ], + [ + "rely", + "R IH0 L AY1" + ], + [ + "large", + "L AA1 R JH" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "ADV", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADV" + ], + [ + "ADV", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.25250057307528573, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.1010002292301143, + "details": { + "clause_a": "However, a new paper I co-authored offers experime...", + "clause_b": "them, they tend to develop shallower knowledge abo...", + "separator": "or", + "zone_a": [ + "summarize", + "information", + "topic" + ], + "zone_b": [ + "tend", + "develop", + "shallower" + ], + "phonetic": { + "avg_similarity": 0.44375143268821426, + "initial_echo": 0.25, + "final_echo": 0.0, + "best_matches": [ + [ + "summarize", + "shallower", + "S AH1 M ER0 AY2 Z", + "SH AE1 L OW0 ER0", + 0.5454545454545454 + ], + [ + "information", + "develop", + "IH2 N F ER0 M EY1 SH AH0 N", + "D IH0 V EH1 L AH0 P", + 0.5777777777777777 + ] + ], + "zone_a_phonetics": [ + [ + "summarize", + "S AH1 M ER0 AY2 Z" + ], + [ + "information", + "IH2 N F ER0 M EY1 SH AH0 N" + ], + [ + "topic", + "T AA1 P IH0 K" + ] + ], + "zone_b_phonetics": [ + [ + "tend", + "T EH1 N D" + ], + [ + "develop", + "D IH0 V EH1 L AH0 P" + ], + [ + "shallower", + "SH AE1 L OW0 ER0" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "NOUN", + "ADJ" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [ + "comparison" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.14924572276073947, + "structural": 0.0, + "semantic": 0.2, + "combined": 0.11969828910429578, + "details": { + "clause_a": "Co-author Jin Ho Yun", + "clause_b": "I, both professors of marketing, reported this fin...", + "separator": "and", + "zone_a": [ + "author", + "jin", + "yun" + ], + "zone_b": [ + "professors", + "marketing", + "reported" + ], + "phonetic": { + "avg_similarity": 0.37311430690184866, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "author", + "AO1 TH ER0" + ], + [ + "jin", + "JH IH1 N" + ], + [ + "yun", + "Y AH1 N" + ] + ], + "zone_b_phonetics": [ + [ + "professors", + "P R AH0 F EH1 S ER0 Z" + ], + [ + "marketing", + "M AA1 R K AH0 T IH0 NG" + ], + [ + "reported", + "R IY2 P AO1 R T IH0 D" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "VERB" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "VERB", + "VERB" + ], + [ + "NOUN", + "VERB" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.14921409657783408, + "structural": 0.3333333333333333, + "semantic": 0.0, + "combined": 0.1596856386311336, + "details": { + "clause_a": "Most of the studies used the same basic paradigm: ...", + "clause_b": "such as how to grow a vegetable garden", + "separator": "\u2013", + "zone_a": [ + "asked", + "learn", + "topic" + ], + "zone_b": [ + "grow", + "vegetable", + "garden" + ], + "phonetic": { + "avg_similarity": 0.37303524144458516, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "topic", + "garden", + "T AA1 P IH0 K", + "G AA1 R D AH0 N", + 0.6428571428571428 + ] + ], + "zone_a_phonetics": [ + [ + "asked", + "AE1 S K T" + ], + [ + "learn", + "L ER1 N" + ], + [ + "topic", + "T AA1 P IH0 K" + ] + ], + "zone_b_phonetics": [ + [ + "grow", + "G R OW1" + ], + [ + "vegetable", + "V EH1 JH T AH0 B AH0 L" + ], + [ + "garden", + "G AA1 R D AH0 N" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "NOUN", + "ADJ" + ], + "pos_b": [ + "NOUN", + "ADJ", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADJ" + ], + [ + "ADJ", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "ADJ" + ] + ], + "method": "bigram", + "similarity": 0.3333333333333333 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.16615267117271929, + "structural": 0.0, + "semantic": 0.2, + "combined": 0.12646106846908772, + "details": { + "clause_a": "such as how to grow a vegetable garden", + "clause_b": "and were randomly assigned to do so by using eithe...", + "separator": "\u2013", + "zone_a": [ + "grow", + "vegetable", + "garden" + ], + "zone_b": [ + "randomly", + "assigned", + "using" + ], + "phonetic": { + "avg_similarity": 0.4153816779317982, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "vegetable", + "randomly", + "V EH1 JH T AH0 B AH0 L", + "R AE1 N D AH0 M L IY0", + 0.5581395348837209 + ], + [ + "garden", + "randomly", + "G AA1 R D AH0 N", + "R AE1 N D AH0 M L IY0", + 0.6111111111111112 + ], + [ + "garden", + "using", + "G AA1 R D AH0 N", + "Y UW1 Z IH0 NG", + 0.5517241379310345 + ] + ], + "zone_a_phonetics": [ + [ + "grow", + "G R OW1" + ], + [ + "vegetable", + "V EH1 JH T AH0 B AH0 L" + ], + [ + "garden", + "G AA1 R D AH0 N" + ] + ], + "zone_b_phonetics": [ + [ + "randomly", + "R AE1 N D AH0 M L IY0" + ], + [ + "assigned", + "AH0 S AY1 N D" + ], + [ + "using", + "Y UW1 Z IH0 NG" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "ADJ", + "NOUN" + ], + "pos_b": [ + "ADV", + "VERB", + "VERB" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "ADJ", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADV", + "VERB" + ], + [ + "VERB", + "VERB" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.22621152921152926, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.09048461168461171, + "details": { + "clause_a": "Most of the studies used the same basic paradigm", + "clause_b": "Participants were asked to learn about a topic \u2013 s...", + "separator": ":", + "zone_a": [ + "used", + "basic", + "paradigm" + ], + "zone_b": [ + "participants", + "asked", + "learn" + ], + "phonetic": { + "avg_similarity": 0.37802882302882307, + "initial_echo": 0.25, + "final_echo": 0.0, + "best_matches": [ + [ + "basic", + "asked", + "B EY1 S IH0 K", + "AE1 S K T", + 0.5454545454545454 + ], + [ + "paradigm", + "participants", + "P EH1 R AH0 D AY2 M", + "P AA0 R T IH1 S AH0 P AH0 N T S", + 0.52 + ] + ], + "zone_a_phonetics": [ + [ + "used", + "Y UW1 Z D" + ], + [ + "basic", + "B EY1 S IH0 K" + ], + [ + "paradigm", + "P EH1 R AH0 D AY2 M" + ] + ], + "zone_b_phonetics": [ + [ + "participants", + "P AA0 R T IH1 S AH0 P AH0 N T S" + ], + [ + "asked", + "AE1 S K T" + ], + [ + "learn", + "L ER1 N" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "ADJ", + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "NOUN" + ], + "bigrams_a": [ + [ + "ADJ", + "NOUN" + ], + [ + "VERB", + "ADJ" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "research", + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.16615267117271929, + "structural": 0.0, + "semantic": 0.2, + "combined": 0.12646106846908772, + "details": { + "clause_a": "Most of the studies used the same basic paradigm: ...", + "clause_b": "were randomly assigned to do so by using either an...", + "separator": "and", + "zone_a": [ + "grow", + "vegetable", + "garden" + ], + "zone_b": [ + "randomly", + "assigned", + "using" + ], + "phonetic": { + "avg_similarity": 0.4153816779317982, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "vegetable", + "randomly", + "V EH1 JH T AH0 B AH0 L", + "R AE1 N D AH0 M L IY0", + 0.5581395348837209 + ], + [ + "garden", + "randomly", + "G AA1 R D AH0 N", + "R AE1 N D AH0 M L IY0", + 0.6111111111111112 + ], + [ + "garden", + "using", + "G AA1 R D AH0 N", + "Y UW1 Z IH0 NG", + 0.5517241379310345 + ] + ], + "zone_a_phonetics": [ + [ + "grow", + "G R OW1" + ], + [ + "vegetable", + "V EH1 JH T AH0 B AH0 L" + ], + [ + "garden", + "G AA1 R D AH0 N" + ] + ], + "zone_b_phonetics": [ + [ + "randomly", + "R AE1 N D AH0 M L IY0" + ], + [ + "assigned", + "AH0 S AY1 N D" + ], + [ + "using", + "Y UW1 Z IH0 NG" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "ADJ", + "NOUN" + ], + "pos_b": [ + "ADV", + "VERB", + "VERB" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "ADJ", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADV", + "VERB" + ], + [ + "VERB", + "VERB" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.15113871635610768, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.06045548654244307, + "details": { + "clause_a": "Most of the studies used the same basic paradigm: ...", + "clause_b": "the \"old-fashioned way,\" by navigating links using...", + "separator": "or", + "zone_a": [ + "llm", + "like", + "chatgpt" + ], + "zone_b": [ + "old", + "fashioned", + "way" + ], + "phonetic": { + "avg_similarity": 0.3778467908902692, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "like", + "L AY1 K" + ] + ], + "zone_b_phonetics": [ + [ + "old", + "OW1 L D" + ], + [ + "fashioned", + "F AE1 SH AH0 N D" + ], + [ + "way", + "W EY1" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "technology", + "technology" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.16456327985739752, + "structural": 0.0, + "semantic": 1.0, + "combined": 0.365825311942959, + "details": { + "clause_a": "No restrictions were put on how they used the tool...", + "clause_b": "they could search on Google as long as they wanted...", + "separator": ";", + "zone_a": [ + "put", + "used", + "tools" + ], + "zone_b": [ + "search", + "google", + "long" + ], + "phonetic": { + "avg_similarity": 0.4114081996434938, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "used", + "google", + "Y UW1 Z D", + "G UW1 G AH0 L", + 0.5454545454545454 + ], + [ + "tools", + "google", + "T UW1 L Z", + "G UW1 G AH0 L", + 0.5454545454545454 + ] + ], + "zone_a_phonetics": [ + [ + "put", + "P UH1 T" + ], + [ + "used", + "Y UW1 Z D" + ], + [ + "tools", + "T UW1 L Z" + ] + ], + "zone_b_phonetics": [ + [ + "search", + "S ER1 CH" + ], + [ + "google", + "G UW1 G AH0 L" + ], + [ + "long", + "L AO1 NG" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "VERB", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "technology" + ], + "zone_b_categories": [ + "technology", + "technology" + ], + "matching_categories": [ + "technology" + ], + "method": "category", + "similarity": 1.0 + } + } + }, + { + "phonetic": 0.159242665530698, + "structural": 0.5, + "semantic": 1.0, + "combined": 0.5136970662122792, + "details": { + "clause_a": "No restrictions were put on how they used the tool...", + "clause_b": "could continue to prompt ChatGPT if they felt they...", + "separator": "and", + "zone_a": [ + "google", + "long", + "wanted" + ], + "zone_b": [ + "continue", + "prompt", + "chatgpt" + ], + "phonetic": { + "avg_similarity": 0.398106663826745, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "wanted", + "continue", + "W AO1 N T IH0 D", + "K AH0 N T IH1 N Y UW0", + 0.5555555555555556 + ] + ], + "zone_a_phonetics": [ + [ + "google", + "G UW1 G AH0 L" + ], + [ + "long", + "L AO1 NG" + ], + [ + "wanted", + "W AO1 N T IH0 D" + ] + ], + "zone_b_phonetics": [ + [ + "continue", + "K AH0 N T IH1 N Y UW0" + ], + [ + "prompt", + "P R AA1 M P T" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "VERB" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "VERB" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "technology" + ], + "zone_b_categories": [ + "technology" + ], + "matching_categories": [ + "technology" + ], + "method": "category", + "similarity": 1.0 + } + } + }, + { + "phonetic": 0.23434239880060312, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.09373695952024125, + "details": { + "clause_a": "The data revealed a consistent pattern", + "clause_b": "People who learned about a topic through an LLM ve...", + "separator": ":", + "zone_a": [ + "revealed", + "consistent", + "pattern" + ], + "zone_b": [ + "people", + "learned", + "topic" + ], + "phonetic": { + "avg_similarity": 0.4358559970015078, + "initial_echo": 0.2, + "final_echo": 0.0, + "best_matches": [ + [ + "pattern", + "people", + "P AE1 T ER0 N", + "P IY1 P AH0 L", + 0.5384615384615384 + ], + [ + "pattern", + "topic", + "P AE1 T ER0 N", + "T AA1 P IH0 K", + 0.5384615384615384 + ] + ], + "zone_a_phonetics": [ + [ + "revealed", + "R IH0 V IY1 L D" + ], + [ + "consistent", + "K AH0 N S IH1 S T AH0 N T" + ], + [ + "pattern", + "P AE1 T ER0 N" + ] + ], + "zone_b_phonetics": [ + [ + "people", + "P IY1 P AH0 L" + ], + [ + "learned", + "L ER1 N D" + ], + [ + "topic", + "T AA1 P IH0 K" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "ADJ" + ], + "bigrams_a": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "ADJ" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.48609018445275753, + "structural": 0.0, + "semantic": 1.0, + "combined": 0.494436073781103, + "details": { + "clause_a": "The data revealed a consistent pattern: People who...", + "clause_b": "ultimately wrote advice that was shorter, less fac...", + "separator": "and", + "zone_a": [ + "subsequently", + "writing", + "advice" + ], + "zone_b": [ + "ultimately", + "wrote", + "advice" + ], + "phonetic": { + "avg_similarity": 0.4652254611318939, + "initial_echo": 0.5, + "final_echo": 0.5, + "best_matches": [ + [ + "subsequently", + "ultimately", + "S AH1 B S AH0 K W AH0 N T L IY0", + "AH1 L T AH0 M AH0 T L IY0", + 0.7857142857142857 + ], + [ + "advice", + "advice", + "AE0 D V AY1 S", + "AE0 D V AY1 S", + 1.0 + ] + ], + "zone_a_phonetics": [ + [ + "subsequently", + "S AH1 B S AH0 K W AH0 N T L IY0" + ], + [ + "writing", + "R AY1 T IH0 NG" + ], + [ + "advice", + "AE0 D V AY1 S" + ] + ], + "zone_b_phonetics": [ + [ + "ultimately", + "AH1 L T AH0 M AH0 T L IY0" + ], + [ + "wrote", + "R OW1 T" + ], + [ + "advice", + "AE0 D V AY1 S" + ] + ] + }, + "structural": { + "pos_a": [ + "ADV", + "VERB", + "NOUN" + ], + "pos_b": [ + "ADV", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "ADV", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADV", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "communication" + ], + "zone_b_categories": [ + "communication" + ], + "matching_categories": [ + "communication" + ], + "method": "category", + "similarity": 1.0 + } + } + }, + { + "phonetic": 0.17879448075526508, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.22151779230210603, + "details": { + "clause_a": "The data revealed a consistent pattern: People who...", + "clause_b": "more generic.", + "separator": "and", + "zone_a": [ + "shorter", + "less", + "factual" + ], + "zone_b": [ + "generic" + ], + "phonetic": { + "avg_similarity": 0.4469862018881627, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "shorter", + "SH AO1 R T ER0" + ], + [ + "less", + "L EH1 S" + ], + [ + "factual", + "F AE1 K CH UW0 AH0 L" + ] + ], + "zone_b_phonetics": [ + [ + "generic", + "JH AH0 N EH1 R IH0 K" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "ADJ", + "ADJ" + ], + "pos_b": [ + "ADJ" + ], + "method": "unigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "comparison" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.38114062829432854, + "structural": 0.0, + "semantic": 1.0, + "combined": 0.45245625131773143, + "details": { + "clause_a": "In turn, when this advice was presented to an inde...", + "clause_b": "they were less likely to adopt it.", + "separator": "and", + "zone_a": [ + "informative", + "less", + "helpful" + ], + "zone_b": [ + "less", + "likely", + "adopt" + ], + "phonetic": { + "avg_similarity": 0.42785157073582125, + "initial_echo": 0.5, + "final_echo": 0.2, + "best_matches": [ + [ + "less", + "less", + "L EH1 S", + "L EH1 S", + 1.0 + ] + ], + "zone_a_phonetics": [ + [ + "informative", + "IH2 N F AO1 R M AH0 T IH0 V" + ], + [ + "less", + "L EH1 S" + ], + [ + "helpful", + "HH EH1 L P F AH0 L" + ] + ], + "zone_b_phonetics": [ + [ + "less", + "L EH1 S" + ], + [ + "likely", + "L AY1 K L IY0" + ], + [ + "adopt", + "AH0 D AA1 P T" + ] + ] + }, + "structural": { + "pos_a": [ + "ADJ", + "ADJ", + "ADJ" + ], + "pos_b": [ + "ADJ", + "ADV", + "NOUN" + ], + "bigrams_a": [ + [ + "ADJ", + "ADJ" + ] + ], + "bigrams_b": [ + [ + "ADJ", + "ADV" + ], + [ + "ADV", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "comparison" + ], + "zone_b_categories": [ + "comparison" + ], + "matching_categories": [ + "comparison" + ], + "method": "category", + "similarity": 1.0 + } + } + }, + { + "phonetic": 0.1360377893711227, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.05441511574844909, + "details": { + "clause_a": "For example, one possible reason LLM users wrote b...", + "clause_b": "more generic advice is simply that the LLM results...", + "separator": "and", + "zone_a": [ + "users", + "wrote", + "briefer" + ], + "zone_b": [ + "generic", + "advice", + "simply" + ], + "phonetic": { + "avg_similarity": 0.34009447342780674, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "users", + "Y UW1 Z ER0 Z" + ], + [ + "wrote", + "R OW1 T" + ], + [ + "briefer", + "B R IY1 F ER0" + ] + ], + "zone_b_phonetics": [ + [ + "generic", + "JH AH0 N EH1 R IH0 K" + ], + [ + "advice", + "AE0 D V AY1 S" + ], + [ + "simply", + "S IH1 M P L IY0" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "ADJ", + "NOUN", + "ADV" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADV" + ], + [ + "ADJ", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "communication" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.32768115942028986, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.28107246376811595, + "details": { + "clause_a": "To control f", + "clause_b": "this possibility, we conducted an experiment where...", + "separator": "or", + "zone_a": [ + "control" + ], + "zone_b": [ + "possibility", + "conducted", + "experiment" + ], + "phonetic": { + "avg_similarity": 0.44420289855072465, + "initial_echo": 0.5, + "final_echo": 0.0, + "best_matches": [ + [ + "control", + "conducted", + "K AH0 N T R OW1 L", + "K AH0 N D AH1 K T AH0 D", + 0.55 + ] + ], + "zone_a_phonetics": [ + [ + "control", + "K AH0 N T R OW1 L" + ] + ], + "zone_b_phonetics": [ + [ + "possibility", + "P AA2 S AH0 B IH1 L AH0 T IY2" + ], + [ + "conducted", + "K AH0 N D AH1 K T AH0 D" + ], + [ + "experiment", + "IH0 K S P EH1 R AH0 M AH0 N T" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "NOUN" + ], + "method": "unigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "research" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1933508562325767, + "structural": 1.0, + "semantic": 0.5, + "combined": 0.5273403424930306, + "details": { + "clause_a": "To control for this possibility, we conducted an e...", + "clause_b": "ChatGPT searches.", + "separator": "and", + "zone_a": [ + "facts", + "results", + "google" + ], + "zone_b": [ + "chatgpt", + "searches" + ], + "phonetic": { + "avg_similarity": 0.4833771405814417, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "google", + "searches", + "G UW1 G AH0 L", + "S ER1 CH IH0 Z", + 0.5185185185185186 + ] + ], + "zone_a_phonetics": [ + [ + "facts", + "F AE1 K T S" + ], + [ + "results", + "R IH0 Z AH1 L T S" + ], + [ + "google", + "G UW1 G AH0 L" + ] + ], + "zone_b_phonetics": [ + [ + "searches", + "S ER1 CH IH0 Z" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 1.0 + }, + "semantic": { + "zone_a_categories": [ + "research", + "technology" + ], + "zone_b_categories": [ + "technology", + "technology" + ], + "matching_categories": [ + "technology" + ], + "method": "category", + "similarity": 0.5 + } + } + }, + { + "phonetic": 0.2166504105213783, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.23666016420855132, + "details": { + "clause_a": "Likewise, in another experiment we held constant t...", + "clause_b": "varied whether participants learned from standard ...", + "separator": "and", + "zone_a": [ + "search", + "platform", + "google" + ], + "zone_b": [ + "varied", + "whether", + "participants" + ], + "phonetic": { + "avg_similarity": 0.3916260263034457, + "initial_echo": 0.2, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "search", + "S ER1 CH" + ], + [ + "platform", + "P L AE1 T F AO2 R M" + ], + [ + "google", + "G UW1 G AH0 L" + ] + ], + "zone_b_phonetics": [ + [ + "varied", + "V EH1 R IY0 D" + ], + [ + "whether", + "W EH1 DH ER0" + ], + [ + "participants", + "P AA0 R T IH1 S AH0 P AH0 N T S" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "VERB", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "technology", + "technology" + ], + "zone_b_categories": [ + "research" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.3216726767335286, + "structural": 1.0, + "semantic": 0.5, + "combined": 0.5786690706934114, + "details": { + "clause_a": "Likewise, in another experiment we held constant t...", + "clause_b": "Google's AI Overview feature.", + "separator": "or", + "zone_a": [ + "standard", + "google", + "results" + ], + "zone_b": [ + "google", + "overview", + "feature" + ], + "phonetic": { + "avg_similarity": 0.4666816918338216, + "initial_echo": 0.25, + "final_echo": 0.2, + "best_matches": [ + [ + "google", + "google", + "G UW1 G AH0 L", + "G UW1 G AH0 L", + 1.0 + ] + ], + "zone_a_phonetics": [ + [ + "standard", + "S T AE1 N D ER0 D" + ], + [ + "google", + "G UW1 G AH0 L" + ], + [ + "results", + "R IH0 Z AH1 L T S" + ] + ], + "zone_b_phonetics": [ + [ + "google", + "G UW1 G AH0 L" + ], + [ + "overview", + "OW1 V ER0 V Y UW2" + ], + [ + "feature", + "F IY1 CH ER0" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 1.0 + }, + "semantic": { + "zone_a_categories": [ + "technology", + "research" + ], + "zone_b_categories": [ + "technology" + ], + "matching_categories": [ + "technology" + ], + "method": "category", + "similarity": 0.5 + } + } + }, + { + "phonetic": 0.31017443249701315, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.12406977299880527, + "details": { + "clause_a": "The findings confirmed that, even when holding the...", + "clause_b": "platform constant, learning from synthesized LLM r...", + "separator": "and", + "zone_a": [ + "confirmed", + "holding", + "facts" + ], + "zone_b": [ + "platform", + "constant", + "learning" + ], + "phonetic": { + "avg_similarity": 0.4754360812425329, + "initial_echo": 0.2, + "final_echo": 0.2, + "best_matches": [ + [ + "holding", + "learning", + "HH OW1 L D IH0 NG", + "L ER1 N IH0 NG", + 0.6451612903225806 + ] + ], + "zone_a_phonetics": [ + [ + "confirmed", + "K AH0 N F ER1 M D" + ], + [ + "holding", + "HH OW1 L D IH0 NG" + ], + [ + "facts", + "F AE1 K T S" + ] + ], + "zone_b_phonetics": [ + [ + "platform", + "P L AE1 T F AO2 R M" + ], + [ + "constant", + "K AA1 N S T AH0 N T" + ], + [ + "learning", + "L ER1 N IH0 NG" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "VERB", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "VERB" + ], + "bigrams_a": [ + [ + "VERB", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.28143703642641393, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.11257481457056558, + "details": { + "clause_a": "The findings confirmed that, even when holding the...", + "clause_b": "synthesizing information for oneself via standard ...", + "separator": "and", + "zone_a": [ + "compared", + "gathering", + "interpreting" + ], + "zone_b": [ + "synthesizing", + "information", + "oneself" + ], + "phonetic": { + "avg_similarity": 0.5160925910660348, + "initial_echo": 0.0, + "final_echo": 0.25, + "best_matches": [ + [ + "compared", + "oneself", + "K AH0 M P EH1 R D", + "W AH2 N S EH1 L F", + 0.6470588235294117 + ], + [ + "gathering", + "synthesizing", + "G AE1 DH ER0 IH0 NG", + "S IH1 N TH AH0 S AY2 Z IH0 NG", + 0.5416666666666667 + ], + [ + "interpreting", + "synthesizing", + "IH1 N T ER0 P R EH2 T IH0 NG", + "S IH1 N TH AH0 S AY2 Z IH0 NG", + 0.7017543859649122 + ] + ], + "zone_a_phonetics": [ + [ + "compared", + "K AH0 M P EH1 R D" + ], + [ + "gathering", + "G AE1 DH ER0 IH0 NG" + ], + [ + "interpreting", + "IH1 N T ER0 P R EH2 T IH0 NG" + ] + ], + "zone_b_phonetics": [ + [ + "synthesizing", + "S IH1 N TH AH0 S AY2 Z IH0 NG" + ], + [ + "information", + "IH2 N F ER0 M EY1 SH AH0 N" + ], + [ + "oneself", + "W AH2 N S EH1 L F" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "VERB", + "VERB" + ], + "pos_b": [ + "VERB", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "VERB", + "VERB" + ] + ], + "bigrams_b": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "comparison" + ], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.25287761747520854, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.10115104699008343, + "details": { + "clause_a": "The findings confirmed that, even when holding the...", + "clause_b": "oneself via standard web links.", + "separator": "or", + "zone_a": [ + "interpreting", + "synthesizing", + "information" + ], + "zone_b": [ + "oneself", + "via", + "standard" + ], + "phonetic": { + "avg_similarity": 0.38219404368802135, + "initial_echo": 0.3333333333333333, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "interpreting", + "IH1 N T ER0 P R EH2 T IH0 NG" + ], + [ + "synthesizing", + "S IH1 N TH AH0 S AY2 Z IH0 NG" + ], + [ + "information", + "IH2 N F ER0 M EY1 SH AH0 N" + ] + ], + "zone_b_phonetics": [ + [ + "oneself", + "W AH2 N S EH1 L F" + ], + [ + "via", + "V AY1 AH0" + ], + [ + "standard", + "S T AE1 N D ER0 D" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "VERB", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "VERB", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.17164303742081521, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.21865721496832607, + "details": { + "clause_a": "When we learn about a topic through Google search,...", + "clause_b": "We must navigate different web links, read informa...", + "separator": ":", + "zone_a": [ + "face", + "much", + "friction" + ], + "zone_b": [ + "navigate", + "different", + "web" + ], + "phonetic": { + "avg_similarity": 0.429107593552038, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "face", + "web", + "F EY1 S", + "W EH1 B", + 0.5714285714285714 + ], + [ + "much", + "web", + "M AH1 CH", + "W EH1 B", + 0.5333333333333333 + ], + [ + "friction", + "different", + "F R IH1 K SH AH0 N", + "D IH1 F ER0 AH0 N T", + 0.6486486486486487 + ] + ], + "zone_a_phonetics": [ + [ + "face", + "F EY1 S" + ], + [ + "much", + "M AH1 CH" + ], + [ + "friction", + "F R IH1 K SH AH0 N" + ] + ], + "zone_b_phonetics": [ + [ + "navigate", + "N AE1 V AH0 G EY2 T" + ], + [ + "different", + "D IH1 F ER0 AH0 N T" + ], + [ + "web", + "W EH1 B" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "VERB", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "cognitive" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.3383787412008503, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.13535149648034014, + "details": { + "clause_a": "When we learn about a topic through Google search,...", + "clause_b": "interpret and synthesize them ourselves.", + "separator": "and", + "zone_a": [ + "read", + "informational", + "sources" + ], + "zone_b": [ + "interpret", + "synthesize", + "ourselves" + ], + "phonetic": { + "avg_similarity": 0.47094685300212585, + "initial_echo": 0.5, + "final_echo": 0.0, + "best_matches": [ + [ + "informational", + "interpret", + "IH2 N F ER0 M EY1 SH AH0 N AH0 L", + "IH2 N T ER1 P R AH0 T", + 0.6415094339622642 + ], + [ + "informational", + "synthesize", + "IH2 N F ER0 M EY1 SH AH0 N AH0 L", + "S IH1 N TH AH0 S AY2 Z", + 0.5185185185185186 + ], + [ + "sources", + "synthesize", + "S AO1 R S AH0 Z", + "S IH1 N TH AH0 S AY2 Z", + 0.5945945945945945 + ] + ], + "zone_a_phonetics": [ + [ + "read", + "R EH1 D" + ], + [ + "informational", + "IH2 N F ER0 M EY1 SH AH0 N AH0 L" + ], + [ + "sources", + "S AO1 R S AH0 Z" + ] + ], + "zone_b_phonetics": [ + [ + "interpret", + "IH2 N T ER1 P R AH0 T" + ], + [ + "synthesize", + "S IH1 N TH AH0 S AY2 Z" + ], + [ + "ourselves", + "AW0 ER0 S EH1 L V Z" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "ADJ", + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "ADJ", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "communication", + "learning", + "communication" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.4990350222543565, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.1996140089017426, + "details": { + "clause_a": "When we learn about a topic through Google search,...", + "clause_b": "synthesize them ourselves.", + "separator": "and", + "zone_a": [ + "informational", + "sources", + "interpret" + ], + "zone_b": [ + "synthesize", + "ourselves" + ], + "phonetic": { + "avg_similarity": 0.49758755563589124, + "initial_echo": 1.0, + "final_echo": 0.0, + "best_matches": [ + [ + "informational", + "synthesize", + "IH2 N F ER0 M EY1 SH AH0 N AH0 L", + "S IH1 N TH AH0 S AY2 Z", + 0.5185185185185186 + ], + [ + "sources", + "synthesize", + "S AO1 R S AH0 Z", + "S IH1 N TH AH0 S AY2 Z", + 0.5945945945945945 + ], + [ + "sources", + "ourselves", + "S AO1 R S AH0 Z", + "AW0 ER0 S EH1 L V Z", + 0.5294117647058824 + ] + ], + "zone_a_phonetics": [ + [ + "informational", + "IH2 N F ER0 M EY1 SH AH0 N AH0 L" + ], + [ + "sources", + "S AO1 R S AH0 Z" + ], + [ + "interpret", + "IH2 N T ER1 P R AH0 T" + ] + ], + "zone_b_phonetics": [ + [ + "synthesize", + "S IH1 N TH AH0 S AY2 Z" + ], + [ + "ourselves", + "AW0 ER0 S EH1 L V Z" + ] + ] + }, + "structural": { + "pos_a": [ + "ADJ", + "NOUN", + "NOUN" + ], + "pos_b": [ + "VERB", + "NOUN" + ], + "bigrams_a": [ + [ + "ADJ", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "learning", + "communication" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.2750305463483758, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.11001221853935034, + "details": { + "clause_a": "Rather, our message is that people simply need to ...", + "clause_b": "which starts by understanding the domains wherein ...", + "separator": "\u2013", + "zone_a": [ + "strategic", + "users", + "llms" + ], + "zone_b": [ + "starts", + "understanding", + "domains" + ], + "phonetic": { + "avg_similarity": 0.4375763658709395, + "initial_echo": 0.3333333333333333, + "final_echo": 0.0, + "best_matches": [ + [ + "strategic", + "understanding", + "S T R AH0 T IY1 JH IH0 K", + "AH2 N D ER0 S T AE1 N D IH0 NG", + 0.5185185185185186 + ] + ], + "zone_a_phonetics": [ + [ + "strategic", + "S T R AH0 T IY1 JH IH0 K" + ], + [ + "users", + "Y UW1 Z ER0 Z" + ] + ], + "zone_b_phonetics": [ + [ + "starts", + "S T AA1 R T S" + ], + [ + "understanding", + "AH2 N D ER0 S T AE1 N D IH0 NG" + ], + [ + "domains", + "D OW0 M EY1 N Z" + ] + ] + }, + "structural": { + "pos_a": [ + "ADJ", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "VERB", + "NOUN" + ], + "bigrams_a": [ + [ + "ADJ", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "technology" + ], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.22292718276589246, + "structural": 0.3333333333333333, + "semantic": 0.0, + "combined": 0.189170873106357, + "details": { + "clause_a": "Rather, our message is that people simply need to ...", + "clause_b": "more strategic users of LLMs \u2013 which starts by und...", + "separator": "or", + "zone_a": [ + "need", + "become", + "smarter" + ], + "zone_b": [ + "strategic", + "users", + "llms" + ], + "phonetic": { + "avg_similarity": 0.3698179569147311, + "initial_echo": 0.25, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "need", + "N IY1 D" + ], + [ + "become", + "B IH0 K AH1 M" + ], + [ + "smarter", + "S M AA1 R T ER0" + ] + ], + "zone_b_phonetics": [ + [ + "strategic", + "S T R AH0 T IY1 JH IH0 K" + ], + [ + "users", + "Y UW1 Z ER0 Z" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "NOUN", + "NOUN" + ], + "pos_b": [ + "ADJ", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADJ", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.3333333333333333 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "technology" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.15166856535277587, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.21066742614111034, + "details": { + "clause_a": "But if your aim is to develop deep", + "clause_b": "generalizable knowledge in an area, relying on LLM...", + "separator": "and", + "zone_a": [ + "aim", + "develop", + "deep" + ], + "zone_b": [ + "generalizable", + "knowledge", + "area" + ], + "phonetic": { + "avg_similarity": 0.37917141338193966, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "develop", + "knowledge", + "D IH0 V EH1 L AH0 P", + "N AA1 L AH0 JH", + 0.5454545454545454 + ] + ], + "zone_a_phonetics": [ + [ + "aim", + "EY1 M" + ], + [ + "develop", + "D IH0 V EH1 L AH0 P" + ], + [ + "deep", + "D IY1 P" + ] + ], + "zone_b_phonetics": [ + [ + "knowledge", + "N AA1 L AH0 JH" + ], + [ + "area", + "EH1 R IY0 AH0" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "ADJ", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADJ", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "technology", + "comparison" + ], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.30665850131978223, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.2726634005279129, + "details": { + "clause_a": "As part of my research on the psychology of new te...", + "clause_b": "new media, I am also interested in whether it's po...", + "separator": "and", + "zone_a": [ + "psychology", + "new", + "technology" + ], + "zone_b": [ + "new", + "media", + "interested" + ], + "phonetic": { + "avg_similarity": 0.3916462532994555, + "initial_echo": 0.25, + "final_echo": 0.25, + "best_matches": [ + [ + "new", + "new", + "N UW1", + "N UW1", + 1.0 + ], + [ + "technology", + "interested", + "T EH0 K N AA1 L AH0 JH IY0", + "IH1 N T R AH0 S T IH0 D", + 0.5306122448979591 + ] + ], + "zone_a_phonetics": [ + [ + "psychology", + "S AY0 K AA1 L AH0 JH IY0" + ], + [ + "new", + "N UW1" + ], + [ + "technology", + "T EH0 K N AA1 L AH0 JH IY0" + ] + ], + "zone_b_phonetics": [ + [ + "new", + "N UW1" + ], + [ + "media", + "M IY1 D IY0 AH0" + ], + [ + "interested", + "IH1 N T R AH0 S T IH0 D" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "VERB" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "technology" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.31615746589158206, + "structural": 0.3333333333333333, + "semantic": 0.0, + "combined": 0.2264629863566328, + "details": { + "clause_a": "Building on this, in my future research I plan to ...", + "clause_b": "specifically, examining which types of guardrails ...", + "separator": "\u2013", + "zone_a": [ + "frictions", + "learning", + "tasks" + ], + "zone_b": [ + "specifically", + "examining", + "types" + ], + "phonetic": { + "avg_similarity": 0.4528936647289551, + "initial_echo": 0.25, + "final_echo": 0.2, + "best_matches": [ + [ + "learning", + "examining", + "L ER1 N IH0 NG", + "IH0 G Z AE1 M IH0 N IH0 NG", + 0.6 + ], + [ + "tasks", + "types", + "T AE1 S K S", + "T AY1 P S", + 0.7 + ] + ], + "zone_a_phonetics": [ + [ + "frictions", + "F R IH1 K SH AH0 N Z" + ], + [ + "learning", + "L ER1 N IH0 NG" + ], + [ + "tasks", + "T AE1 S K S" + ] + ], + "zone_b_phonetics": [ + [ + "specifically", + "S P AH0 S IH1 F IH0 K L IY0" + ], + [ + "examining", + "IH0 G Z AE1 M IH0 N IH0 NG" + ], + [ + "types", + "T AY1 P S" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "VERB", + "NOUN" + ], + "pos_b": [ + "ADV", + "VERB", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADV", + "VERB" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "VERB", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.3333333333333333 + }, + "semantic": { + "zone_a_categories": [ + "cognitive", + "learning" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.16810355026443022, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.0672414201057721, + "details": { + "clause_a": "Building on this, in my future research I plan to ...", + "clause_b": "learning tasks \u2013 specifically, examining which typ...", + "separator": "or", + "zone_a": [ + "impose", + "healthy", + "frictions" + ], + "zone_b": [ + "learning", + "tasks", + "specifically" + ], + "phonetic": { + "avg_similarity": 0.4202588756610755, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "healthy", + "specifically", + "HH EH1 L TH IY0", + "S P AH0 S IH1 F IH0 K L IY0", + 0.5238095238095238 + ], + [ + "frictions", + "learning", + "F R IH1 K SH AH0 N Z", + "L ER1 N IH0 NG", + 0.5294117647058824 + ] + ], + "zone_a_phonetics": [ + [ + "impose", + "IH2 M P OW1 Z" + ], + [ + "healthy", + "HH EH1 L TH IY0" + ], + [ + "frictions", + "F R IH1 K SH AH0 N Z" + ] + ], + "zone_b_phonetics": [ + [ + "learning", + "L ER1 N IH0 NG" + ], + [ + "tasks", + "T AE1 S K S" + ], + [ + "specifically", + "S P AH0 S IH1 F IH0 K L IY0" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "VERB", + "NOUN", + "ADV" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADV" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "cognitive" + ], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.22321015104717593, + "structural": 0.3333333333333333, + "semantic": 0.2, + "combined": 0.24928406041887036, + "details": { + "clause_a": "Building on this, in my future research I plan to ...", + "clause_b": "speed bumps most successfully motivate users to ac...", + "separator": "or", + "zone_a": [ + "examining", + "types", + "guardrails" + ], + "zone_b": [ + "speed", + "bumps", + "successfully" + ], + "phonetic": { + "avg_similarity": 0.4080253776179398, + "initial_echo": 0.0, + "final_echo": 0.2, + "best_matches": [ + [ + "examining", + "successfully", + "IH0 G Z AE1 M IH0 N IH0 NG", + "S AH0 K S EH1 S F AH0 L IY0", + 0.5660377358490566 + ], + [ + "types", + "bumps", + "T AY1 P S", + "B AH1 M P S", + 0.7 + ] + ], + "zone_a_phonetics": [ + [ + "examining", + "IH0 G Z AE1 M IH0 N IH0 NG" + ], + [ + "types", + "T AY1 P S" + ], + [ + "guardrails", + "G AA1 R D R EY2 L Z" + ] + ], + "zone_b_phonetics": [ + [ + "speed", + "S P IY1 D" + ], + [ + "bumps", + "B AH1 M P S" + ], + [ + "successfully", + "S AH0 K S EH1 S F AH0 L IY0" + ] + ] + }, + "structural": { + "pos_a": [ + "VERB", + "NOUN", + "NOUN" + ], + "pos_b": [ + "VERB", + "NOUN", + "ADV" + ], + "bigrams_a": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADV" + ], + [ + "VERB", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "VERB", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.3333333333333333 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.20172295584900632, + "structural": 1.0, + "semantic": 0.0, + "combined": 0.3806891823396025, + "details": { + "clause_a": "Such tools would seem particularly critical in sec...", + "clause_b": "challenge for educators is how best to equip stude...", + "separator": "or", + "zone_a": [ + "secondary", + "education", + "maj" + ], + "zone_b": [ + "challenge", + "educators", + "best" + ], + "phonetic": { + "avg_similarity": 0.5043073896225158, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "secondary", + "challenge", + "S EH1 K AH0 N D EH2 R IY0", + "CH AE1 L AH0 N JH", + 0.5714285714285714 + ], + [ + "secondary", + "educators", + "S EH1 K AH0 N D EH2 R IY0", + "EH1 JH AH0 K EY2 T ER0 Z", + 0.6122448979591837 + ], + [ + "education", + "challenge", + "EH2 JH AH0 K EY1 SH AH0 N", + "CH AE1 L AH0 N JH", + 0.5714285714285714 + ] + ], + "zone_a_phonetics": [ + [ + "secondary", + "S EH1 K AH0 N D EH2 R IY0" + ], + [ + "education", + "EH2 JH AH0 K EY1 SH AH0 N" + ], + [ + "maj", + "M AE1 JH" + ] + ], + "zone_b_phonetics": [ + [ + "challenge", + "CH AE1 L AH0 N JH" + ], + [ + "educators", + "EH1 JH AH0 K EY2 T ER0 Z" + ], + [ + "best", + "B EH1 S T" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 1.0 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1790452416289031, + "structural": 1.0, + "semantic": 0.0, + "combined": 0.3716180966515612, + "details": { + "clause_a": "Such tools would seem particularly critical in sec...", + "clause_b": "educators is how best to equip students to develop...", + "separator": "or", + "zone_a": [ + "education", + "major", + "challenge" + ], + "zone_b": [ + "educators", + "best", + "equip" + ], + "phonetic": { + "avg_similarity": 0.4476131040722577, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "education", + "educators", + "EH2 JH AH0 K EY1 SH AH0 N", + "EH1 JH AH0 K EY2 T ER0 Z", + 0.7346938775510203 + ] + ], + "zone_a_phonetics": [ + [ + "education", + "EH2 JH AH0 K EY1 SH AH0 N" + ], + [ + "major", + "M EY1 JH ER0" + ], + [ + "challenge", + "CH AE1 L AH0 N JH" + ] + ], + "zone_b_phonetics": [ + [ + "educators", + "EH1 JH AH0 K EY2 T ER0 Z" + ], + [ + "best", + "B EH1 S T" + ], + [ + "equip", + "IH0 K W IH1 P" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 1.0 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1577829577829578, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.06311318311318313, + "details": { + "clause_a": "Such tools would seem particularly critical in sec...", + "clause_b": "math skills while also preparing for a real world ...", + "separator": "and", + "zone_a": [ + "foundational", + "reading", + "writing" + ], + "zone_b": [ + "math", + "skills", + "while" + ], + "phonetic": { + "avg_similarity": 0.3944573944573945, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "writing", + "math", + "R AY1 T IH0 NG", + "M AE1 TH", + 0.5454545454545454 + ] + ], + "zone_a_phonetics": [ + [ + "foundational", + "F AW0 N D EY1 SH AH0 N AH0 L" + ], + [ + "reading", + "R IY1 D IH0 NG" + ], + [ + "writing", + "R AY1 T IH0 NG" + ] + ], + "zone_b_phonetics": [ + [ + "math", + "M AE1 TH" + ], + [ + "skills", + "S K IH1 L Z" + ], + [ + "while", + "W AY1 L" + ] + ] + }, + "structural": { + "pos_a": [ + "ADJ", + "VERB", + "VERB" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "VERB", + "VERB" + ], + [ + "ADJ", + "VERB" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "communication" + ], + "zone_b_categories": [ + "learning" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.2837476678967224, + "structural": 0.3333333333333333, + "semantic": 0.0, + "combined": 0.21349906715868894, + "details": { + "clause_a": "Such tools would seem particularly critical in sec...", + "clause_b": "a real world where LLMs are likely to be an integr...", + "separator": "or", + "zone_a": [ + "skills", + "while", + "preparing" + ], + "zone_b": [ + "real", + "world", + "llms" + ], + "phonetic": { + "avg_similarity": 0.521869169741806, + "initial_echo": 0.25, + "final_echo": 0.0, + "best_matches": [ + [ + "skills", + "real", + "S K IH1 L Z", + "R IY1 L", + 0.5555555555555556 + ], + [ + "while", + "real", + "W AY1 L", + "R IY1 L", + 0.7142857142857143 + ], + [ + "while", + "world", + "W AY1 L", + "W ER1 L D", + 0.625 + ] + ], + "zone_a_phonetics": [ + [ + "skills", + "S K IH1 L Z" + ], + [ + "while", + "W AY1 L" + ], + [ + "preparing", + "P R IY0 P EH1 R IH0 NG" + ] + ], + "zone_b_phonetics": [ + [ + "real", + "R IY1 L" + ], + [ + "world", + "W ER1 L D" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "VERB" + ], + "pos_b": [ + "ADJ", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "VERB" + ], + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "ADJ", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.3333333333333333 + }, + "semantic": { + "zone_a_categories": [ + "learning" + ], + "zone_b_categories": [ + "technology" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.14090319355967348, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.0563612774238694, + "details": { + "clause_a": "And it's easy to understand their appeal: Ask a qu...", + "clause_b": "However, a new paper I co-authored offers experime...", + "separator": "however", + "zone_a": [ + "like", + "effortless", + "learning" + ], + "zone_b": [ + "new", + "paper", + "authored" + ], + "phonetic": { + "avg_similarity": 0.35225798389918367, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "like", + "L AY1 K" + ], + [ + "effortless", + "EH1 F ER0 T L AH0 S" + ], + [ + "learning", + "L ER1 N IH0 NG" + ] + ], + "zone_b_phonetics": [ + [ + "new", + "N UW1" + ], + [ + "paper", + "P EY1 P ER0" + ], + [ + "authored", + "AO1 TH ER0 D" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "ADJ", + "VERB" + ], + "pos_b": [ + "NOUN", + "NOUN", + "VERB" + ], + "bigrams_a": [ + [ + "NOUN", + "ADJ" + ], + [ + "ADJ", + "VERB" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "cognitive", + "learning" + ], + "zone_b_categories": [ + "research" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1648217377416344, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.06592869509665376, + "details": { + "clause_a": "The data revealed a consistent pattern: People who...", + "clause_b": "In turn, when this advice was presented to an inde...", + "separator": "in turn", + "zone_a": [ + "less", + "factual", + "generic" + ], + "zone_b": [ + "turn", + "advice", + "presented" + ], + "phonetic": { + "avg_similarity": 0.41205434435408594, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "less", + "turn", + "L EH1 S", + "T ER1 N", + 0.5714285714285714 + ], + [ + "generic", + "presented", + "JH AH0 N EH1 R IH0 K", + "P R IY0 Z EH1 N T IH0 D", + 0.6046511627906976 + ] + ], + "zone_a_phonetics": [ + [ + "less", + "L EH1 S" + ], + [ + "factual", + "F AE1 K CH UW0 AH0 L" + ], + [ + "generic", + "JH AH0 N EH1 R IH0 K" + ] + ], + "zone_b_phonetics": [ + [ + "turn", + "T ER1 N" + ], + [ + "advice", + "AE0 D V AY1 S" + ], + [ + "presented", + "P R IY0 Z EH1 N T IH0 D" + ] + ] + }, + "structural": { + "pos_a": [ + "ADJ", + "ADJ", + "ADJ" + ], + "pos_b": [ + "NOUN", + "NOUN", + "VERB" + ], + "bigrams_a": [ + [ + "ADJ", + "ADJ" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "VERB" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "comparison" + ], + "zone_b_categories": [ + "communication" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.1702307364691266, + "structural": 0.5, + "semantic": 0.2, + "combined": 0.2780922945876506, + "details": { + "clause_a": "We found these differences to be robust across a v...", + "clause_b": "For example, one possible reason LLM users wrote b...", + "separator": "for example", + "zone_a": [ + "across", + "variety", + "contexts" + ], + "zone_b": [ + "example", + "one", + "possible" + ], + "phonetic": { + "avg_similarity": 0.42557684117281647, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "variety", + "possible", + "V ER0 AY1 AH0 T IY0", + "P AA1 S AH0 B AH0 L", + 0.5263157894736843 + ] + ], + "zone_a_phonetics": [ + [ + "across", + "AH0 K R AO1 S" + ], + [ + "variety", + "V ER0 AY1 AH0 T IY0" + ], + [ + "contexts", + "K AA1 N T EH2 K S T S" + ] + ], + "zone_b_phonetics": [ + [ + "example", + "IH0 G Z AE1 M P AH0 L" + ], + [ + "one", + "W AH1 N" + ], + [ + "possible", + "P AA1 S AH0 B AH0 L" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "ADJ" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADJ" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "method": "no_overlap", + "note": "No semantic data" + } + } + }, + { + "phonetic": 0.1622173852165833, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.21488695408663333, + "details": { + "clause_a": "To control for this possibility, we conducted an e...", + "clause_b": "Likewise, in another experiment we held constant t...", + "separator": "likewise", + "zone_a": [ + "google", + "chatgpt", + "searches" + ], + "zone_b": [ + "likewise", + "another", + "experiment" + ], + "phonetic": { + "avg_similarity": 0.4055434630414582, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [], + "zone_a_phonetics": [ + [ + "google", + "G UW1 G AH0 L" + ], + [ + "searches", + "S ER1 CH IH0 Z" + ] + ], + "zone_b_phonetics": [ + [ + "likewise", + "L AY1 K W AY2 Z" + ], + [ + "another", + "AH0 N AH1 DH ER0" + ], + [ + "experiment", + "IH0 K S P EH1 R AH0 M AH0 N T" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "VERB", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "VERB", + "NOUN" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [ + "technology", + "technology", + "technology" + ], + "zone_b_categories": [ + "research" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.17639841639841644, + "structural": 0.0, + "semantic": 0.0, + "combined": 0.07055936655936658, + "details": { + "clause_a": "But with LLMs, this entire process is done on the ...", + "clause_b": "To be clear, we do not believe the solution to the...", + "separator": "to be clear", + "zone_a": [ + "active", + "passive", + "process" + ], + "zone_b": [ + "clear", + "believe", + "solution" + ], + "phonetic": { + "avg_similarity": 0.44099604099604106, + "initial_echo": 0.0, + "final_echo": 0.0, + "best_matches": [ + [ + "active", + "clear", + "AE1 K T IH0 V", + "K L IH1 R", + 0.5454545454545454 + ], + [ + "passive", + "solution", + "P AE1 S IH0 V", + "S AH0 L UW1 SH AH0 N", + 0.5454545454545454 + ] + ], + "zone_a_phonetics": [ + [ + "active", + "AE1 K T IH0 V" + ], + [ + "passive", + "P AE1 S IH0 V" + ], + [ + "process", + "P R AA1 S EH2 S" + ] + ], + "zone_b_phonetics": [ + [ + "clear", + "K L IH1 R" + ], + [ + "believe", + "B IH0 L IY1 V" + ], + [ + "solution", + "S AH0 L UW1 SH AH0 N" + ] + ] + }, + "structural": { + "pos_a": [ + "ADJ", + "ADJ", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "NOUN" + ], + "bigrams_a": [ + [ + "ADJ", + "NOUN" + ], + [ + "ADJ", + "ADJ" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [], + "method": "bigram", + "similarity": 0.0 + }, + "semantic": { + "zone_a_categories": [ + "cognitive", + "cognitive", + "cognitive" + ], + "zone_b_categories": [], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + }, + { + "phonetic": 0.24402762559028546, + "structural": 0.5, + "semantic": 0.0, + "combined": 0.24761105023611418, + "details": { + "clause_a": "To be clear, we do not believe the solution to the...", + "clause_b": "Rather, our message is that people simply need to ...", + "separator": "rather", + "zone_a": [ + "offer", + "many", + "contexts" + ], + "zone_b": [ + "message", + "people", + "simply" + ], + "phonetic": { + "avg_similarity": 0.4225690639757136, + "initial_echo": 0.25, + "final_echo": 0.0, + "best_matches": [ + [ + "many", + "message", + "M EH1 N IY0", + "M EH1 S AH0 JH", + 0.64 + ], + [ + "many", + "simply", + "M EH1 N IY0", + "S IH1 M P L IY0", + 0.6153846153846154 + ] + ], + "zone_a_phonetics": [ + [ + "offer", + "AO1 F ER0" + ], + [ + "many", + "M EH1 N IY0" + ], + [ + "contexts", + "K AA1 N T EH2 K S T S" + ] + ], + "zone_b_phonetics": [ + [ + "message", + "M EH1 S AH0 JH" + ], + [ + "people", + "P IY1 P AH0 L" + ], + [ + "simply", + "S IH1 M P L IY0" + ] + ] + }, + "structural": { + "pos_a": [ + "NOUN", + "NOUN", + "NOUN" + ], + "pos_b": [ + "NOUN", + "NOUN", + "ADV" + ], + "bigrams_a": [ + [ + "NOUN", + "NOUN" + ] + ], + "bigrams_b": [ + [ + "NOUN", + "ADV" + ], + [ + "NOUN", + "NOUN" + ] + ], + "matching_bigrams": [ + [ + "NOUN", + "NOUN" + ] + ], + "method": "bigram", + "similarity": 0.5 + }, + "semantic": { + "zone_a_categories": [], + "zone_b_categories": [ + "communication" + ], + "matching_categories": [], + "method": "category", + "similarity": 0.0 + } + } + } + ] +} \ No newline at end of file diff --git a/scripts/manual_analysis.py b/scripts/manual_analysis.py new file mode 100644 index 0000000..b8bdd55 --- /dev/null +++ b/scripts/manual_analysis.py @@ -0,0 +1,654 @@ +#!/usr/bin/env python3 +"""Manual Echo Rule Watermark Analysis Script. + +This script performs a manual analysis of text for Echo Rule watermark patterns +when the full spaCy model is not available. It implements the core concepts +from the SpecHO pipeline using available tools. + +The Echo Rule watermark detection looks for three types of echoes between +clause boundaries: +1. Phonetic echoes - similar sounds at clause joints +2. Structural echoes - parallel grammatical structures +3. Semantic echoes - thematically related word choices +""" + +import re +import sys +from pathlib import Path +from typing import List, Dict, Tuple, Optional +from dataclasses import dataclass, field +from collections import Counter +import json + +# Try to import available libraries +try: + import cmudict + CMU_DICT = cmudict.dict() + HAS_CMU = True +except ImportError: + HAS_CMU = False + CMU_DICT = {} + +try: + from Levenshtein import ratio as levenshtein_ratio + HAS_LEVENSHTEIN = True +except ImportError: + HAS_LEVENSHTEIN = False + +try: + import numpy as np + HAS_NUMPY = True +except ImportError: + HAS_NUMPY = False + + +@dataclass +class ClausePair: + """Represents a pair of adjacent clauses for analysis.""" + clause_a: str + clause_b: str + separator: str # What separates them (punctuation, conjunction, etc.) + zone_a: List[str] # Terminal words from clause_a + zone_b: List[str] # Initial words from clause_b + + +@dataclass +class EchoScore: + """Scores for a single clause pair.""" + phonetic: float = 0.0 + structural: float = 0.0 + semantic: float = 0.0 + combined: float = 0.0 + details: Dict = field(default_factory=dict) + + +@dataclass +class AnalysisReport: + """Complete analysis report for a document.""" + text_length: int + word_count: int + sentence_count: int + clause_pairs_found: int + echo_scores: List[EchoScore] = field(default_factory=list) + average_phonetic: float = 0.0 + average_structural: float = 0.0 + average_semantic: float = 0.0 + final_score: float = 0.0 + verdict: str = "UNKNOWN" + confidence: float = 0.0 + reasoning: List[str] = field(default_factory=list) + + +# ============================================================================ +# CONTENT WORD DETECTION +# ============================================================================ + +# Common function words to exclude from echo analysis +FUNCTION_WORDS = { + 'the', 'a', 'an', 'and', 'or', 'but', 'if', 'then', 'when', 'where', 'who', + 'what', 'which', 'how', 'why', 'is', 'are', 'was', 'were', 'be', 'been', + 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', + 'should', 'may', 'might', 'can', 'shall', 'must', 'to', 'of', 'in', 'for', + 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', + 'after', 'above', 'below', 'between', 'under', 'over', 'out', 'up', 'down', + 'off', 'about', 'than', 'so', 'that', 'this', 'these', 'those', 'it', 'its', + 'they', 'them', 'their', 'we', 'us', 'our', 'you', 'your', 'he', 'him', 'his', + 'she', 'her', 'i', 'me', 'my', 'not', 'no', 'yes', 'all', 'each', 'every', + 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'any', 'only', 'same', + 'also', 'just', 'even', 'very', 'too', 'however', 'therefore', 'thus', + 'meanwhile', 'moreover', 'furthermore', 'nonetheless', 'nevertheless', + 'indeed', 'instead', 'rather', 'yet', 'still', 'already', 'always', 'never', + 'often', 'sometimes', 'usually', 'perhaps', 'probably', 'certainly' +} + +TRANSITION_PHRASES = [ + 'however', 'therefore', 'thus', 'meanwhile', 'moreover', 'furthermore', + 'nonetheless', 'nevertheless', 'in turn', 'rather', 'indeed', 'instead', + 'as a result', 'on the other hand', 'in contrast', 'similarly', 'likewise', + 'to be clear', 'in other words', 'for example', 'for instance' +] + + +def is_content_word(word: str) -> bool: + """Check if a word is a content word (not a function word).""" + return word.lower() not in FUNCTION_WORDS and len(word) > 2 + + +def get_content_words(text: str) -> List[str]: + """Extract content words from text.""" + words = re.findall(r'\b[a-zA-Z]+\b', text.lower()) + return [w for w in words if is_content_word(w)] + + +# ============================================================================ +# CLAUSE IDENTIFICATION +# ============================================================================ + +def identify_sentences(text: str) -> List[str]: + """Split text into sentences.""" + # Handle common abbreviations + text = re.sub(r'\b(Dr|Mr|Mrs|Ms|Prof|Inc|Ltd|Jr|Sr)\.\s', r'\1_DOT_ ', text) + text = re.sub(r'\b(e\.g|i\.e)\.\s', r'\1_DOT_ ', text) + + # Split on sentence-ending punctuation + sentences = re.split(r'(?<=[.!?])\s+', text) + + # Restore dots + sentences = [s.replace('_DOT_', '.') for s in sentences] + + return [s.strip() for s in sentences if s.strip()] + + +def identify_clause_pairs(text: str) -> List[ClausePair]: + """Identify clause pairs using punctuation and conjunction rules.""" + pairs = [] + sentences = identify_sentences(text) + + for sentence in sentences: + # Rule A: Punctuation-linked clauses (semicolon, em-dash, colon) + for sep in [';', ' – ', ': ', ' — ']: + if sep in sentence: + parts = sentence.split(sep) + for i in range(len(parts) - 1): + if len(parts[i].strip()) > 10 and len(parts[i+1].strip()) > 10: + pairs.append(create_clause_pair(parts[i], parts[i+1], sep)) + + # Rule B: Conjunction-linked clauses + conj_pattern = r',?\s*(but|and|or|yet)\s+' + matches = list(re.finditer(conj_pattern, sentence, re.IGNORECASE)) + for match in matches: + before = sentence[:match.start()] + after = sentence[match.end():] + if len(before.strip()) > 10 and len(after.strip()) > 10: + pairs.append(create_clause_pair(before, after, match.group(1))) + + # Rule C: Transition-linked sentences (check consecutive sentences) + for i in range(len(sentences) - 1): + sentence_b = sentences[i + 1] + for transition in TRANSITION_PHRASES: + if sentence_b.lower().startswith(transition.lower()): + pairs.append(create_clause_pair(sentences[i], sentence_b, transition)) + break + + return pairs + + +def create_clause_pair(clause_a: str, clause_b: str, separator: str) -> ClausePair: + """Create a ClausePair with extracted zones.""" + words_a = get_content_words(clause_a) + words_b = get_content_words(clause_b) + + # Zone A: last 3 content words + zone_a = words_a[-3:] if len(words_a) >= 3 else words_a + + # Zone B: first 3 content words + zone_b = words_b[:3] if len(words_b) >= 3 else words_b + + return ClausePair( + clause_a=clause_a.strip(), + clause_b=clause_b.strip(), + separator=separator.strip(), + zone_a=zone_a, + zone_b=zone_b + ) + + +# ============================================================================ +# PHONETIC ANALYSIS +# ============================================================================ + +def get_phonetic(word: str) -> Optional[str]: + """Get ARPAbet phonetic transcription for a word.""" + if not HAS_CMU: + return None + + word_lower = word.lower() + if word_lower in CMU_DICT: + # Return first pronunciation, joined + return ' '.join(CMU_DICT[word_lower][0]) + return None + + +def analyze_phonetic_echo(zone_a: List[str], zone_b: List[str]) -> Tuple[float, Dict]: + """Analyze phonetic similarity between zones.""" + if not HAS_CMU or not HAS_LEVENSHTEIN: + return 0.0, {'error': 'Required libraries not available'} + + # Get phonetics for all words + phonetics_a = [(w, get_phonetic(w)) for w in zone_a] + phonetics_b = [(w, get_phonetic(w)) for w in zone_b] + + # Filter to words with phonetics + phonetics_a = [(w, p) for w, p in phonetics_a if p] + phonetics_b = [(w, p) for w, p in phonetics_b if p] + + if not phonetics_a or not phonetics_b: + return 0.0, {'note': 'No phonetic data available'} + + # Calculate pairwise similarities + similarities = [] + best_matches = [] + for word_a, phon_a in phonetics_a: + for word_b, phon_b in phonetics_b: + sim = levenshtein_ratio(phon_a, phon_b) + similarities.append(sim) + if sim > 0.5: + best_matches.append((word_a, word_b, phon_a, phon_b, sim)) + + avg_sim = sum(similarities) / len(similarities) if similarities else 0.0 + + # Check for specific echo patterns + # Initial consonant clusters + initial_echo = check_initial_consonants(phonetics_a, phonetics_b) + + # Final sounds (rhyming) + final_echo = check_final_sounds(phonetics_a, phonetics_b) + + # Combine scores (weighted) + combined = 0.4 * avg_sim + 0.3 * initial_echo + 0.3 * final_echo + + return combined, { + 'avg_similarity': avg_sim, + 'initial_echo': initial_echo, + 'final_echo': final_echo, + 'best_matches': best_matches[:3], + 'zone_a_phonetics': phonetics_a, + 'zone_b_phonetics': phonetics_b + } + + +def check_initial_consonants(phonetics_a: List, phonetics_b: List) -> float: + """Check for alliterative patterns (matching initial consonants).""" + initials_a = set() + initials_b = set() + + for _, phon in phonetics_a: + if phon: + first_phone = phon.split()[0] if phon else '' + # Strip stress markers from vowels + first_phone = re.sub(r'[0-9]', '', first_phone) + if first_phone and first_phone[0] not in 'AEIOU': + initials_a.add(first_phone) + + for _, phon in phonetics_b: + if phon: + first_phone = phon.split()[0] if phon else '' + first_phone = re.sub(r'[0-9]', '', first_phone) + if first_phone and first_phone[0] not in 'AEIOU': + initials_b.add(first_phone) + + if not initials_a or not initials_b: + return 0.0 + + # Jaccard similarity of initial consonants + intersection = initials_a & initials_b + union = initials_a | initials_b + + return len(intersection) / len(union) if union else 0.0 + + +def check_final_sounds(phonetics_a: List, phonetics_b: List) -> float: + """Check for rhyming patterns (matching final sounds).""" + finals_a = set() + finals_b = set() + + for _, phon in phonetics_a: + if phon: + phones = phon.split() + if len(phones) >= 2: + # Last two phonemes (stripped of stress) + final = ' '.join(re.sub(r'[0-9]', '', p) for p in phones[-2:]) + finals_a.add(final) + + for _, phon in phonetics_b: + if phon: + phones = phon.split() + if len(phones) >= 2: + final = ' '.join(re.sub(r'[0-9]', '', p) for p in phones[-2:]) + finals_b.add(final) + + if not finals_a or not finals_b: + return 0.0 + + intersection = finals_a & finals_b + union = finals_a | finals_b + + return len(intersection) / len(union) if union else 0.0 + + +# ============================================================================ +# STRUCTURAL ANALYSIS +# ============================================================================ + +def get_simple_pos(word: str) -> str: + """Simple rule-based POS approximation.""" + word_lower = word.lower() + + # Common verb endings + if word_lower.endswith(('ing', 'ed', 'ize', 'ise', 'ate')): + return 'VERB' + # Common noun endings + if word_lower.endswith(('tion', 'ment', 'ness', 'ity', 'ism', 'ist', 'er', 'or')): + return 'NOUN' + # Common adjective endings + if word_lower.endswith(('ive', 'ous', 'ful', 'less', 'able', 'ible', 'al', 'ic')): + return 'ADJ' + # Common adverb endings + if word_lower.endswith('ly'): + return 'ADV' + + return 'NOUN' # Default + + +def analyze_structural_echo(zone_a: List[str], zone_b: List[str]) -> Tuple[float, Dict]: + """Analyze structural (POS pattern) similarity between zones.""" + if not zone_a or not zone_b: + return 0.0, {'error': 'Empty zones'} + + # Get POS tags + pos_a = [get_simple_pos(w) for w in zone_a] + pos_b = [get_simple_pos(w) for w in zone_b] + + # Create bigrams + bigrams_a = set() + bigrams_b = set() + + for i in range(len(pos_a) - 1): + bigrams_a.add((pos_a[i], pos_a[i+1])) + for i in range(len(pos_b) - 1): + bigrams_b.add((pos_b[i], pos_b[i+1])) + + # Jaccard similarity of bigrams + if not bigrams_a or not bigrams_b: + # Fall back to unigram comparison + pos_set_a = set(pos_a) + pos_set_b = set(pos_b) + intersection = pos_set_a & pos_set_b + union = pos_set_a | pos_set_b + unigram_sim = len(intersection) / len(union) if union else 0.0 + return unigram_sim, { + 'pos_a': pos_a, + 'pos_b': pos_b, + 'method': 'unigram', + 'similarity': unigram_sim + } + + intersection = bigrams_a & bigrams_b + union = bigrams_a | bigrams_b + bigram_sim = len(intersection) / len(union) if union else 0.0 + + return bigram_sim, { + 'pos_a': pos_a, + 'pos_b': pos_b, + 'bigrams_a': list(bigrams_a), + 'bigrams_b': list(bigrams_b), + 'matching_bigrams': list(intersection), + 'method': 'bigram', + 'similarity': bigram_sim + } + + +# ============================================================================ +# SEMANTIC ANALYSIS +# ============================================================================ + +# Word categories for basic semantic grouping +SEMANTIC_CATEGORIES = { + 'learning': ['learn', 'learning', 'knowledge', 'understand', 'education', 'skill', 'study', 'information', 'insight', 'comprehension'], + 'technology': ['llm', 'chatgpt', 'ai', 'model', 'google', 'search', 'tool', 'technology', 'digital', 'computer', 'algorithm'], + 'research': ['study', 'research', 'experiment', 'paper', 'finding', 'data', 'evidence', 'participant', 'result', 'analysis'], + 'cognitive': ['think', 'thought', 'cognitive', 'mental', 'process', 'active', 'passive', 'engage', 'effort', 'friction'], + 'comparison': ['compare', 'versus', 'difference', 'similar', 'contrast', 'better', 'worse', 'more', 'less', 'shallow', 'deep'], + 'communication': ['write', 'read', 'advice', 'information', 'message', 'response', 'summary', 'synthesis', 'source'], +} + + +def get_semantic_category(word: str) -> Optional[str]: + """Get semantic category for a word.""" + word_lower = word.lower() + for category, words in SEMANTIC_CATEGORIES.items(): + if word_lower in words or any(word_lower.startswith(w) for w in words): + return category + return None + + +def analyze_semantic_echo(zone_a: List[str], zone_b: List[str]) -> Tuple[float, Dict]: + """Analyze semantic relatedness between zones.""" + if not zone_a or not zone_b: + return 0.0, {'error': 'Empty zones'} + + # Get categories + cats_a = [get_semantic_category(w) for w in zone_a] + cats_b = [get_semantic_category(w) for w in zone_b] + + # Filter None values + cats_a = [c for c in cats_a if c] + cats_b = [c for c in cats_b if c] + + # Check for overlapping categories + if not cats_a and not cats_b: + # No categorized words - use simple word overlap + words_a = set(w.lower() for w in zone_a) + words_b = set(w.lower() for w in zone_b) + overlap = words_a & words_b + if overlap: + return 0.8, {'method': 'exact_overlap', 'overlapping': list(overlap)} + return 0.2, {'method': 'no_overlap', 'note': 'No semantic data'} + + cats_set_a = set(cats_a) + cats_set_b = set(cats_b) + + intersection = cats_set_a & cats_set_b + union = cats_set_a | cats_set_b + + category_sim = len(intersection) / len(union) if union else 0.0 + + return category_sim, { + 'zone_a_categories': cats_a, + 'zone_b_categories': cats_b, + 'matching_categories': list(intersection), + 'method': 'category', + 'similarity': category_sim + } + + +# ============================================================================ +# MAIN ANALYSIS +# ============================================================================ + +def analyze_text(text: str) -> AnalysisReport: + """Perform complete Echo Rule watermark analysis on text.""" + # Basic stats + sentences = identify_sentences(text) + words = re.findall(r'\b[a-zA-Z]+\b', text) + + report = AnalysisReport( + text_length=len(text), + word_count=len(words), + sentence_count=len(sentences), + clause_pairs_found=0 + ) + + # Identify clause pairs + pairs = identify_clause_pairs(text) + report.clause_pairs_found = len(pairs) + + if not pairs: + report.verdict = "INSUFFICIENT_DATA" + report.reasoning.append("No suitable clause pairs found for echo analysis") + return report + + # Analyze each pair + phonetic_scores = [] + structural_scores = [] + semantic_scores = [] + + for pair in pairs: + # Phonetic analysis + phon_score, phon_details = analyze_phonetic_echo(pair.zone_a, pair.zone_b) + + # Structural analysis + struct_score, struct_details = analyze_structural_echo(pair.zone_a, pair.zone_b) + + # Semantic analysis + sem_score, sem_details = analyze_semantic_echo(pair.zone_a, pair.zone_b) + + # Combined score (using Tier 1 weights: 40% phonetic, 30% structural, 30% semantic) + combined = 0.4 * phon_score + 0.3 * struct_score + 0.3 * sem_score + + echo_score = EchoScore( + phonetic=phon_score, + structural=struct_score, + semantic=sem_score, + combined=combined, + details={ + 'clause_a': pair.clause_a[:50] + '...' if len(pair.clause_a) > 50 else pair.clause_a, + 'clause_b': pair.clause_b[:50] + '...' if len(pair.clause_b) > 50 else pair.clause_b, + 'separator': pair.separator, + 'zone_a': pair.zone_a, + 'zone_b': pair.zone_b, + 'phonetic': phon_details, + 'structural': struct_details, + 'semantic': sem_details + } + ) + + report.echo_scores.append(echo_score) + phonetic_scores.append(phon_score) + structural_scores.append(struct_score) + semantic_scores.append(sem_score) + + # Calculate averages + report.average_phonetic = sum(phonetic_scores) / len(phonetic_scores) + report.average_structural = sum(structural_scores) / len(structural_scores) + report.average_semantic = sum(semantic_scores) / len(semantic_scores) + + # Final document score (Tier 1: simple average of combined scores) + combined_scores = [es.combined for es in report.echo_scores] + report.final_score = sum(combined_scores) / len(combined_scores) + + # Generate verdict + # These thresholds are based on the Echo Rule watermark detection spec + # Human text typically scores 0.15-0.30, AI with watermark scores 0.45+ + if report.final_score >= 0.45: + report.verdict = "HIGH_PROBABILITY_AI" + report.confidence = min(0.95, 0.5 + report.final_score) + report.reasoning.append(f"High echo score ({report.final_score:.3f}) suggests Echo Rule watermark presence") + elif report.final_score >= 0.35: + report.verdict = "MODERATE_PROBABILITY_AI" + report.confidence = 0.3 + report.final_score + report.reasoning.append(f"Moderate echo score ({report.final_score:.3f}) - possible watermark presence") + elif report.final_score >= 0.25: + report.verdict = "LOW_PROBABILITY_AI" + report.confidence = 0.2 + report.final_score * 0.5 + report.reasoning.append(f"Low echo score ({report.final_score:.3f}) - unlikely watermark presence") + else: + report.verdict = "LIKELY_HUMAN" + report.confidence = max(0.1, 0.6 - report.final_score) + report.reasoning.append(f"Very low echo score ({report.final_score:.3f}) - consistent with human writing") + + # Add specific observations + if report.average_phonetic > 0.4: + report.reasoning.append(f"Elevated phonetic echoes ({report.average_phonetic:.3f}) - sound patterns at clause boundaries") + if report.average_structural > 0.5: + report.reasoning.append(f"Strong structural parallelism ({report.average_structural:.3f}) - similar grammatical patterns") + if report.average_semantic > 0.4: + report.reasoning.append(f"Semantic coherence ({report.average_semantic:.3f}) - related concepts across boundaries") + + return report + + +def print_report(report: AnalysisReport, verbose: bool = False): + """Print formatted analysis report.""" + print("\n" + "=" * 70) + print(" SPECULATIVE ECHO RULE WATERMARK ANALYSIS REPORT") + print("=" * 70) + + print(f"\n{'Document Statistics':=^50}") + print(f" Text length: {report.text_length:,} characters") + print(f" Word count: {report.word_count:,} words") + print(f" Sentences: {report.sentence_count}") + print(f" Clause pairs analyzed: {report.clause_pairs_found}") + + print(f"\n{'Echo Scores':=^50}") + print(f" Phonetic echo (avg): {report.average_phonetic:.4f}") + print(f" Structural echo (avg): {report.average_structural:.4f}") + print(f" Semantic echo (avg): {report.average_semantic:.4f}") + print(f" {'─' * 40}") + print(f" FINAL SCORE: {report.final_score:.4f}") + + print(f"\n{'Verdict':=^50}") + print(f" Status: {report.verdict}") + print(f" Confidence: {report.confidence:.1%}") + + print(f"\n{'Chain of Reasoning':=^50}") + for i, reason in enumerate(report.reasoning, 1): + print(f" {i}. {reason}") + + if verbose and report.echo_scores: + print(f"\n{'Detailed Clause Pair Analysis':=^50}") + for i, score in enumerate(report.echo_scores[:10], 1): + print(f"\n Pair {i}:") + print(f" Separator: '{score.details.get('separator', 'N/A')}'") + print(f" Zone A (end): {score.details.get('zone_a', [])}") + print(f" Zone B (start): {score.details.get('zone_b', [])}") + print(f" Phonetic: {score.phonetic:.3f}, Structural: {score.structural:.3f}, Semantic: {score.semantic:.3f}") + print(f" Combined: {score.combined:.3f}") + + print("\n" + "=" * 70) + + +def main(): + """Main entry point.""" + # Read input file + input_file = Path('/home/user/specHO/data/analysis_input.txt') + + if not input_file.exists(): + print("Error: Input file not found") + sys.exit(1) + + text = input_file.read_text() + print(f"Analyzing text ({len(text):,} characters)...") + + # Run analysis + report = analyze_text(text) + + # Print report + print_report(report, verbose=True) + + # Save JSON report + output_file = Path('/home/user/specHO/data/analysis_output.json') + report_dict = { + 'text_length': report.text_length, + 'word_count': report.word_count, + 'sentence_count': report.sentence_count, + 'clause_pairs_found': report.clause_pairs_found, + 'average_phonetic': report.average_phonetic, + 'average_structural': report.average_structural, + 'average_semantic': report.average_semantic, + 'final_score': report.final_score, + 'verdict': report.verdict, + 'confidence': report.confidence, + 'reasoning': report.reasoning, + 'echo_scores': [ + { + 'phonetic': es.phonetic, + 'structural': es.structural, + 'semantic': es.semantic, + 'combined': es.combined, + 'details': es.details + } + for es in report.echo_scores + ] + } + + with open(output_file, 'w') as f: + json.dump(report_dict, f, indent=2, default=str) + + print(f"\nJSON report saved to: {output_file}") + + return report + + +if __name__ == '__main__': + main()