From 511ce3c3dbe617a46cf7e55c0ea3140b4808a69d Mon Sep 17 00:00:00 2001
From: bob
Date: Mon, 20 Aug 2018 14:18:21 -0400
Subject: [PATCH] TS-9 final change

---
 TextSearch/Common/Constants.ecl              |   2 +-
 TextSearch/Common/Default_Keywording.ecl     |   5 +-
 TextSearch/Common/FileName_Info.ecl          |  13 +-
 TextSearch/Common/FileName_Info_Instance.ecl |   6 +-
 TextSearch/Common/FileNames.ecl              |  28 +--
 TextSearch/Common/IKeywording.ecl            |   4 +-
 TextSearch/Common/Layouts.ecl                |   2 +-
 TextSearch/Common/NumericCollationFormat.ecl |   4 +-
 TextSearch/Common/Pattern_Definitions.ecl    |   3 +-
 TextSearch/Common/Types.ecl                  |  14 +-
 TextSearch/Inverted/Base_Data.ecl            |   2 +-
 TextSearch/Inverted/Basic_Key_List.ecl       |  21 --
 TextSearch/Inverted/Build_Slice_Action.ecl   |  16 +-
 TextSearch/Inverted/Layouts.ecl              |  21 +-
 TextSearch/Inverted/Manage_Superkeys.ecl     |  39 ---
 TextSearch/Inverted/ParsedText.ecl           |  86 ++++++-
 TextSearch/Inverted/RawPostings.ecl          |   8 +-
 TextSearch/Inverted/SpecialPostings.ecl      |   2 +-
 TextSearch/Inverted/check.ecl                |  54 ++++
 TextSearch/Inverted/check2.ecl               |  90 +++++++
 TextSearch/Inverted/check3.ecl               | 186 ++++++++++++++
 TextSearch/Inverted/initialism.ecl           | 108 ++++++++
 TextSearch/Inverted/john1.ecl                |  84 +++++++
 TextSearch/Inverted/john2.ecl                |  46 ++++
 TextSearch/Inverted/moby.ecl                 |  33 +++
 TextSearch/Inverted/states.ecl               | 150 +++++++++++
 TextSearch/Inverted/test_moby.ecl            |  52 ++++
 TextSearch/Inverted/try2.ecl                 | 133 ++++++++++
 TextSearch/Inverted/try4.ecl                 | 247 +++++++++++++++++++
 TextSearch/Inverted/word2vec_1.ecl           | 102 ++++++++
 TextSearch/Inverted/word2vec_2.ecl           | 108 ++++++++
 TextSearch/Inverted/word2vec_3.ecl           | 144 +++++++++++
 32 files changed, 1656 insertions(+), 157 deletions(-)
 delete mode 100644 TextSearch/Inverted/Basic_Key_List.ecl
 delete mode 100644 TextSearch/Inverted/Manage_Superkeys.ecl
 create mode 100644 TextSearch/Inverted/check.ecl
 create mode 100644 TextSearch/Inverted/check2.ecl
 create mode 100644 TextSearch/Inverted/check3.ecl
 create mode 100644 TextSearch/Inverted/initialism.ecl
 create mode 100644 TextSearch/Inverted/john1.ecl
 create mode 100644 TextSearch/Inverted/john2.ecl
 create mode 100644 TextSearch/Inverted/moby.ecl
 create mode 100644 TextSearch/Inverted/states.ecl
 create mode 100644 TextSearch/Inverted/test_moby.ecl
 create mode 100644 TextSearch/Inverted/try2.ecl
 create mode 100644 TextSearch/Inverted/try4.ecl
 create mode 100644 TextSearch/Inverted/word2vec_1.ecl
 create mode 100644 TextSearch/Inverted/word2vec_2.ecl
 create mode 100644 TextSearch/Inverted/word2vec_3.ecl

diff --git a/TextSearch/Common/Constants.ecl b/TextSearch/Common/Constants.ecl
index 18ad6b7..87d96e9 100644
--- a/TextSearch/Common/Constants.ecl
+++ b/TextSearch/Common/Constants.ecl
@@ -1,4 +1,4 @@
-EXPORT Constants := MODULE
+EXPORT Constants := MODULE
 // Limit Constants
   EXPORT Max_SearchTerms := 1000;
   EXPORT Max_Ops := 2 * Max_SearchTerms;
diff --git a/TextSearch/Common/Default_Keywording.ecl b/TextSearch/Common/Default_Keywording.ecl
index 06a0cb5..b7d483c 100644
--- a/TextSearch/Common/Default_Keywording.ecl
+++ b/TextSearch/Common/Default_Keywording.ecl
@@ -1,10 +1,11 @@
 //Default implementation. Provides minimal functionality.
-IMPORT Std.Uni;
+//Supplies the default values for the definitions declared in the IKeywording interface.
+IMPORT Std.Uni; //for case conversion (ToUpperCase below)
 IMPORT TextSearch.Common;
 IMPORT TextSearch.Common.Types;
 IMPORT TextSearch.Common.Layouts;
 TermString := Types.TermString;
-EquivTerm := Layouts.EquivTerm;
+EquivTerm := Layouts.EquivTerm; //the equivalent-term record layout
 Version := Types.Version;
 NoEquiv := DATASET([],EquivTerm);
 ToUpper := Uni.ToUpperCase;
diff --git a/TextSearch/Common/FileName_Info.ecl b/TextSearch/Common/FileName_Info.ecl
index 6455ce1..41dcde7 100644
--- a/TextSearch/Common/FileName_Info.ecl
+++ b/TextSearch/Common/FileName_Info.ecl
@@ -1,13 +1,8 @@
-//FileName Info structured used for file name generation.
-//This version includes pre-Slice management hack to support tracking update
-//versions with incremental updates.
-EXPORT FileName_Info := INTERFACE
+EXPORT FileName_Info := INTERFACE
   EXPORT STRING Prefix;
   EXPORT STRING Instance; // the version for an individual instance or the Alias
   EXPORT STRING AliasInstance := 'CURRENT';
-  EXPORT SET OF STRING AliasInstances := [AliasInstance, 'LAST', 'PAST', 'DELETED'];
-  EXPORT UNSIGNED2 Naming := 1; // version of naming system
-  EXPORT UNSIGNED2 DataVersion := 0; // placeholder for data version to build
-  EXPORT UNSIGNED1 Levels := 5;
-  EXPORT STRING UseInstance(UNSIGNED indx) := IF(indx=0, Instance, AliasInstances[indx]);
+  EXPORT UNSIGNED2 Naming := 1;
+  EXPORT UNSIGNED2 DataVersion := 0;
+  EXPORT UNSIGNED1 Levels := 5; //index levels 0 through 4
 END;
diff --git a/TextSearch/Common/FileName_Info_Instance.ecl b/TextSearch/Common/FileName_Info_Instance.ecl
index e6df211..b3638e9 100644
--- a/TextSearch/Common/FileName_Info_Instance.ecl
+++ b/TextSearch/Common/FileName_Info_Instance.ecl
@@ -1,8 +1,8 @@
-//Instance of the FileName_Info block. Used to unify the names used by TextSearch.
+//Instance of the FileName_Info block. Used to unify the names used by TextSearch.
 IMPORT TextSearch.Common;
 IMPORT STD.Str;
 Info := Common.FileName_Info;
-EXPORT FileName_Info_Instance(STRING aPre, STRING aInst) := MODULE(Info)
+EXPORT FileName_Info_Instance(STRING aPre, STRING aInst) := MODULE(Info) //inherits every field declared in FileName_Info
   STRING wPrefix := TRIM(Str.ToUpperCase(aPre),ALL);
   EXPORT STRING Prefix := IF(wPrefix<>'',
                              wPrefix,
@@ -10,5 +10,5 @@ EXPORT FileName_Info_Instance(STRING aPre, STRING aInst) := MODULE(Info)
                              Common.Constants.No_Prfx_code,
                              (STRING)Common.Constants.No_Prfx_Msg));
   STRING wInst := TRIM(Str.ToUpperCase(aInst),ALL);
-  EXPORT STRING Instance := IF(wInst<>'', wInst, AliasInstance);
+  EXPORT STRING Instance := IF(wInst<>'', wInst, AliasInstance); //falls back to AliasInstance, 'CURRENT'
 END;
\ No newline at end of file
diff --git a/TextSearch/Common/FileNames.ecl b/TextSearch/Common/FileNames.ecl
index 5705e77..a71e23f 100644
--- a/TextSearch/Common/FileNames.ecl
+++ b/TextSearch/Common/FileNames.ecl
@@ -1,4 +1,4 @@
-IMPORT TextSearch.Common;
+IMPORT TextSearch.Common;
 //Creates file names. The names are both the names of the individual
 //logical files and the container names used as aliases for a group
 //of file instances.
@@ -9,11 +9,11 @@ IMPORT TextSearch.Common;
 // Instance is FileName.Instance; and Suffix is the data type as below.
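 // For example (illustrative, assuming Prefix 'MYAPP' and the default
 // Instance 'CURRENT'), DocumentIndex(0) below yields:
 //   'MYAPP::DocSearch::Level-00::CURRENT::DocIndx'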
 FileName_Info := Common.FileName_Info;
-EXPORT FileNames(FileName_Info info, UNSIGNED Alias=0) := MODULE
+EXPORT FileNames(FileName_Info info) := MODULE //builds the logical file names for one index instance
   SHARED DocSearchPrefix := '::DocSearch::Level-';
   SHARED Name(STRING suffix, UNSIGNED lvl) := info.Prefix + DocSearchPrefix
                                             + INTFORMAT(lvl, 2, 1) + '::'
-                                            + info.UseInstance(Alias) + '::' + suffix;
+                                            + info.Instance + '::' + suffix;
 
   EXPORT DocumentIndex(UNSIGNED lvl=0) := Name('DocIndx', lvl);
   EXPORT TriGramDictionary(UNSIGNED lvl=0) := Name('TriDctIndx', lvl);
@@ -28,26 +28,4 @@ EXPORT FileNames(FileName_Info info, UNSIGNED Alias=0) := MODULE
   EXPORT TagDictionary(UNSIGNED lvl=0) := Name('TagIndx', lvl);
   EXPORT IdentIndx(UNSIGNED1 lvl=0) := Name('IdentIndx', lvl);
   EXPORT DeleteIndex(UNSIGNED1 lvl=0) := NAME('DelIndx', lvl);
-  EXPORT NameEnum := Common.Types.FileEnum;
-  EXPORT NameByEnum(NameEnum ne, UNSIGNED1 lvl=0)
-    := CASE(ne,
-            NameEnum.DocumentIndex => DocumentIndex(lvl),
-            NameEnum.TriGramDictionary => TriGramDictionary(lvl),
-            NameEnum.TermDictionary => TermDictionary(lvl),
-            NameEnum.TriGramIndex => TriGramIndex(lvl),
-            NameEnum.TermIndex => TermIndex(lvl),
-            NameEnum.PhraseIndex => PhraseIndex(lvl),
-            NameEnum.ElementIndex => ElementIndex(lvl),
-            NameEnum.AttributeIndex => AttributeIndex(lvl),
-            NameEnum.RangeIndex => RangeIndex(lvl),
-            NameEnum.NameSpaceDict => NameSpaceDict(lvl),
-            NameEnum.TagDictionary => TagDictionary(lvl),
-            NameEnum.IdentIndx => IdentIndx(lvl),
-            NameEnum.DeleteIndex => DeleteIndex(lvl),
-            Name('BadEnum', lvl));
-  // the currently building keys. Add triGramDictionary and TriGramIndex when ready
-  EXPORT NameSet := [NameEnum.DocumentIndex, NameEnum.TermDictionary, NameEnum.TermIndex,
-                     NameEnum.PhraseIndex, NameEnum.ElementIndex, NameEnum.AttributeIndex,
-                     NameEnum.RangeIndex, NameEnum.TagDictionary, NameEnum.IdentIndx,
-                     NameEnum.DeleteIndex];
 END;
\ No newline at end of file
diff --git a/TextSearch/Common/IKeywording.ecl b/TextSearch/Common/IKeywording.ecl
index 4d2a608..79ec85d 100644
--- a/TextSearch/Common/IKeywording.ecl
+++ b/TextSearch/Common/IKeywording.ecl
@@ -3,11 +3,11 @@ IMPORT TextSearch.Common.Types;
 IMPORT TextSearch.Common.Layouts;
 TermString := Types.TermString;
-EquivTerm := Layouts.EquivTerm;
+EquivTerm := Layouts.EquivTerm; //the equivalent-term record layout
 Version := Types.Version;
 EXPORT IKeywording := INTERFACE
-  EXPORT Version currentVersion;
+  EXPORT Version currentVersion; //the keywording Version this implementation provides
   EXPORT BOOLEAN hasEquivalence(TermString trm, Version v=currentVersion);
   EXPORT TermString SingleKeyword(TermString trm, Version v=currentVersion);
   EXPORT DATASET(EquivTerm) EquivKeywords(TermString trm, Version v=currentVersion);
diff --git a/TextSearch/Common/Layouts.ecl b/TextSearch/Common/Layouts.ecl
index 20a4272..f571b73 100644
--- a/TextSearch/Common/Layouts.ecl
+++ b/TextSearch/Common/Layouts.ecl
@@ -2,7 +2,7 @@ IMPORT TextSearch.Common.Types;
 IMPORT TextSearch.Inverted.Layouts AS InvertedLayouts;
 EXPORT Layouts := MODULE
-  EXPORT DocIndex := RECORD(InvertedLayouts.Document-content)
+  EXPORT DocIndex := RECORD(InvertedLayouts.Document-content) //inherits the Document layout minus its content field
 Types.KWP keywords;
     Types.Position docLength;
     STRING18 wunit;
diff --git a/TextSearch/Common/NumericCollationFormat.ecl b/TextSearch/Common/NumericCollationFormat.ecl
index e73956a..8641c51 100644
--- a/TextSearch/Common/NumericCollationFormat.ecl
+++ b/TextSearch/Common/NumericCollationFormat.ecl
@@ -1,4 +1,4 @@
-/*
+/*
 
 Layout in big endian:
 
@@ -39,7 +39,7 @@ EXPORT NumericCollationFormat := MODULE
     int expnt = 0;
     char ch = '\0';
 
-    for (int i = 0; i < (int)lenNumstr; i++)
+    for (int i = 0; i < lenNumstr; i++)
     {
       if ((ch = numstr[i]) == '.')
       {
diff --git a/TextSearch/Common/Pattern_Definitions.ecl b/TextSearch/Common/Pattern_Definitions.ecl
index 3662a91..01302be 100644
--- a/TextSearch/Common/Pattern_Definitions.ecl
+++ b/TextSearch/Common/Pattern_Definitions.ecl
@@ -1,4 +1,4 @@
-EXPORT Pattern_Definitions := MACRO
+EXPORT Pattern_Definitions := MACRO
 // Pure Whitespace
   PATTERN LowControl := PATTERN(U'[\u0001-\u0008\u000B\u000C\u000E\u000F]');
   PATTERN HighControl := PATTERN(U'[\u007F-\u009F]');
@@ -100,6 +100,7 @@ EXPORT Pattern_Definitions := MACRO
 // Composite patterns
 // Word strings
   PATTERN Letter := PATTERN(U'[[:alpha:]]');
+  //PATTERN NOLetter := PATTERN(U'[^[:alpha:]]');
   PATTERN LowerCase := PATTERN(U'[[:lower:]]');
   PATTERN UpperCase := PATTERN(U'[[:upper:]]');
   PATTERN Digit := PATTERN(U'[[:digit:]]');
diff --git a/TextSearch/Common/Types.ecl b/TextSearch/Common/Types.ecl
index 9128267..9061713 100644
--- a/TextSearch/Common/Types.ecl
+++ b/TextSearch/Common/Types.ecl
@@ -1,4 +1,4 @@
-// Types for search system
+// Types for search system
 EXPORT Types := MODULE
 
   EXPORT DocNo := UNSIGNED4;
@@ -16,8 +16,9 @@ EXPORT Types := MODULE
                           SymbolChar,  // Ampersand, Section, et cetera
                           NoiseChar,   // Noise, such as a comma or Tab
                           WhiteSpace,  // blanks
-                          SpecialStr); // special keyword string
-  EXPORT TermTypeAsString(TermType typ) := CASE(typ,
+                          SpecialStr,  // special keyword string
+                          AcroStr);    // initialisms and acronyms
+  EXPORT TermTypeAsString(TermType typ) := CASE(typ, //maps the TermType ordinal to its display string, e.g. 1 => 'Text String'
                 1 => V'Text String',
                 2 => V'Number',
                 3 => V'Date',
@@ -27,9 +28,10 @@ EXPORT Types := MODULE
                 7 => V'Noise Character',
                 8 => V'White Space',
                 9 => V'Special Keyword',
+                10 => V'Initialism and Acronyms',
                 V'Unknown');
   EXPORT KeywordTTypes := [TermType.TextStr, TermType.Number,
-                           TermType.Date, TermType.SymbolChar];
+                           TermType.Date, TermType.SymbolChar, TermType.AcroStr];
   EXPORT InvertTTypes := [TermType.TextStr, TermType.Number,
                           TermType.Date, TermType.Meta,
                           TermType.Tag, TermType.SymbolChar,
@@ -87,8 +89,4 @@ EXPORT Types := MODULE
   EXPORT DocIdentifier := UNICODE;
   EXPORT SequenceKey := STRING50;
   EXPORT SlugLine := UNICODE;
-  EXPORT FileEnum := ENUM(UNSIGNED1, Unknown=0, DocumentIndex, TriGramDictionary,
-                          TermDictionary, TriGramIndex, TermIndex, PhraseIndex,
-                          ELementIndex, AttributeIndex, RangeIndex, NameSpaceDict,
-                          TagDictionary, IdentIndx, DeleteIndex);
 END;
\ No newline at end of file
diff --git a/TextSearch/Inverted/Base_Data.ecl b/TextSearch/Inverted/Base_Data.ecl
index 69a5aac..03bed52 100644
--- a/TextSearch/Inverted/Base_Data.ecl
+++ b/TextSearch/Inverted/Base_Data.ecl
@@ -9,7 +9,7 @@ EXPORT Base_Data(Common.FileName_Info info,
                  DATASET(Inv_Layouts.DocumentIngest) docsIn):= MODULE
   // The documents must be enumerated
   SHARED keyword_mod := Common.Default_Keywording;
-  EXPORT enumDocs := Inverted.EnumeratedDocs(info, docsIn);
+  EXPORT enumDocs := Inverted.EnumeratedDocs(info, docsIn); //pipeline entry: number the ingested documents
   EXPORT rawPostings := Inverted.RawPostings(enumDocs);
   EXPORT DocIndex :=
Inverted.DocIndex(enumDocs, UNGROUP(rawPostings)); // Need to get Replaced doc list diff --git a/TextSearch/Inverted/Basic_Key_List.ecl b/TextSearch/Inverted/Basic_Key_List.ecl deleted file mode 100644 index 1104cf7..0000000 --- a/TextSearch/Inverted/Basic_Key_List.ecl +++ /dev/null @@ -1,21 +0,0 @@ -IMPORT TextSearch.Inverted.Layouts; -IMPORT TextSearch.Common; - -FileName_Info := Common.FileName_Info; -FileName_Info_Instance := Common.FileName_Info_Instance; -FileNames := Common.FileNames; -Types := Common.Types; - -EXPORT DATASET(Layouts.Managed_File_Names) Basic_Key_List(FileName_Info info) := FUNCTION - Layouts.Managed_File_Names makeEntry(Types.FileEnum name) := TRANSFORM - SELF.logical_name := FileNames(info, 0).NameByEnum(name); - SELF.current_name := FileNames(info, 1).NameByEnum(name); - SELF.previous_name := FileNames(info, 2).NameByEnum(name); - SELF.past_previous_name := FileNames(info, 3).NameByEnum(name); - SELF.deleted_name := FileNames(info, 4).NameByEnum(name); - SELF.delete_deleted := TRUE; - SELF.task := Layouts.Management_Task.Replace; - END; - ds := DATASET(COUNT(FileNames(info).NameSet), makeEntry(FileNames(info).NameSet[COUNTER])); - RETURN ds; -END; \ No newline at end of file diff --git a/TextSearch/Inverted/Build_Slice_Action.ecl b/TextSearch/Inverted/Build_Slice_Action.ecl index a9ba797..437bca9 100644 --- a/TextSearch/Inverted/Build_Slice_Action.ecl +++ b/TextSearch/Inverted/Build_Slice_Action.ecl @@ -1,14 +1,10 @@ // The action for building a slice, given the name of the Ingest file, and the //prefix and instance for the file names. -// Optional parameter is a dataset used to list other files that we want managed. IMPORT TextSearch.Common; IMPORT TextSearch.Inverted; Ingest := Inverted.Layouts.DocumentIngest; -Managed_File_Names := Inverted.Layouts.Managed_File_Names; -empty := DATASET([], Managed_File_Names); -EXPORT Build_Slice_Action(STRING ingestName, STRING prfx, STRING inst, - DATASET(Managed_File_Names) mfn=empty) := FUNCTION +EXPORT Build_Slice_Action(STRING ingestName, STRING prfx, STRING inst) := FUNCTION inDocs := DATASET(ingestName, Ingest, THOR); info := Common.FileName_Info_Instance(prfx, inst); kwm := Common.Default_Keywording; @@ -22,7 +18,7 @@ EXPORT Build_Slice_Action(STRING ingestName, STRING prfx, STRING inst, TrmDict := base.TermDict; TagDict := base.TagDict; Replaced := base.ReplacedDocs; - bc := PARALLEL( + ac := PARALLEL( BUILD(Common.Keys(info).TermIndex(TrmPosts)) ,BUILD(Common.Keys(info).ElementIndex(tagposts)) ,BUILD(Common.Keys(info).PhraseIndex(PhrsPosts)) @@ -34,13 +30,5 @@ EXPORT Build_Slice_Action(STRING ingestName, STRING prfx, STRING inst, ,BUILD(Common.Keys(info).IdentIndex(docIndx)) ,BUILD(Common.Keys(info).DeleteIndex(Replaced)) ); - Task_Enum := Inverted.Layouts.Management_Task; - good_mfn := ASSERT(mfn, (task=Task_Enum.NoOp) - OR (task=Task_Enum.Replace AND logical_name<>'' - AND current_name<>'' AND previous_name<>'' - AND past_previous_name<>'' AND deleted_name<>'' ), - 'Missing required file names for action', FAIL); - key_list := Inverted.Basic_Key_List(info) + good_mfn; - ac := SEQUENTIAL(bc, Inverted.Manage_Superkeys(info, key_list)); RETURN ac; END; \ No newline at end of file diff --git a/TextSearch/Inverted/Layouts.ecl b/TextSearch/Inverted/Layouts.ecl index 67fe79b..07c9d94 100644 --- a/TextSearch/Inverted/Layouts.ecl +++ b/TextSearch/Inverted/Layouts.ecl @@ -7,11 +7,13 @@ EXPORT Layouts := MODULE Types.SequenceKey seqKey; Types.SlugLine slugLine; UNICODE content; + UNICODE init; + //string init_w_pun; END; 
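+  //The init field added above is a working copy of content: the experimental
+  //jobs below (check*.ecl, john*.ecl, try*.ecl) use it to hold the text with
+  //dotted initialisms collapsed, e.g. 'U.S.A' -> 'USA' (illustrative example).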
 EXPORT DocumentNo := RECORD
-  Types.DocNo id;
+  Types.DocNo id; //id uses the Types.DocNo numbering type
 END;
-  EXPORT Document := RECORD(DocumentIngest)
+  EXPORT Document := RECORD(DocumentIngest) //inherits every field of DocumentIngest plus those of DocumentNo
   DocumentNo;
 END;
 // Posting Record, generated by parsing the documents.
@@ -34,16 +36,5 @@ EXPORT Layouts := MODULE
     Types.TermString tagValue;
     Types.PathString pathString;
     Types.TermString parentName;
-  END;
-  // Record for the machinery to manage file names with super keys (super files)
-  EXPORT Management_Task := ENUM(UNSIGNED1, NoOp=0, Replace); // Future
-  EXPORT Managed_File_Names := RECORD
-    STRING logical_name;
-    STRING current_name;
-    STRING previous_name;
-    STRING past_previous_name;
-    STRING deleted_name;
-    BOOLEAN delete_deleted;
-    Management_Task task;
-  END;
-END;
\ No newline at end of file
+  END;
+END;
diff --git a/TextSearch/Inverted/Manage_Superkeys.ecl b/TextSearch/Inverted/Manage_Superkeys.ecl
deleted file mode 100644
index 2b684a6..0000000
--- a/TextSearch/Inverted/Manage_Superkeys.ecl
+++ /dev/null
@@ -1,39 +0,0 @@
-// Version for pre-Slice keys.
-// Assumes that user replaces the collection.
-//
-IMPORT TextSearch.Common;
-IMPORT TextSearch.Inverted;
-IMPORT TextSeaRch.Inverted.Layouts;
-IMPORT STD.File AS FS;
-
-Managed_File_Names := Layouts.Managed_File_Names;
-Management_Task := Layouts.Management_Task;
-FileName_Info := Common.FileName_Info;
-
-
-EXPORT Manage_Superkeys(FileName_Info info, DATASET(Managed_File_Names) mfn) := FUNCTION
-  ac := SEQUENTIAL(
-    // Make sure the aliases exist, create as necessary
-    NOTHOR(APPLY(mfn,
-      IF(NOT FS.SuperFileExists(current_name), FS.CreateSuperFile(current_name))
-      ,IF(NOT FS.SuperFileExists(previous_name), FS.CreateSuperFile(previous_name))
-      ,IF(NOT FS.SuperFileExists(past_previous_name), FS.CreateSuperFile(past_previous_name))
-      ,IF(NOT FS.SuperFileExists(deleted_name), FS.CreateSuperFile(deleted_name))
-    ))
-    ,OUTPUT(mfn, NAMED('Files_List'))
-    ,FS.StartSuperFileTransaction()
-    ,NOTHOR(APPLY(mfn,
-      FS.SwapSuperFile(deleted_name, past_previous_name)
-      ,FS.SwapSuperFile(past_previous_name, previous_name)
-      ,FS.SwapSuperFile(previous_name, current_name)
-      ,FS.ClearSuperFile(current_name)
-      ,FS.AddSuperFile(current_name, logical_name)
-    ))
-    ,FS.FinishSuperFileTransaction()
-    ,NOTHOR(APPLY(mfn,
-      FS.RemoveOwnedSubFiles(deleted_name, delete_deleted)
-      ,FS.ClearSuperFile(deleted_name)
-    ))
-  );
-  RETURN ac;
-END;
\ No newline at end of file
diff --git a/TextSearch/Inverted/ParsedText.ecl b/TextSearch/Inverted/ParsedText.ecl
index 59e7d40..720e9ef 100644
--- a/TextSearch/Inverted/ParsedText.ecl
+++ b/TextSearch/Inverted/ParsedText.ecl
@@ -1,7 +1,8 @@
-// Parse contents of the document
+// Parse contents of the document
 IMPORT TextSearch;
 IMPORT TextSearch.Common;
 IMPORT TextSearch.Inverted.Layouts;
+IMPORT STD;
 Document := Layouts.Document;
 RawPosting := Layouts.RawPosting;
 Types := Common.Types;
@@ -37,11 +38,27 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
   PATTERN EmptyEnd := REPEAT(AttrListItem) OPT(Spaces) U'/>';
   PATTERN XMLElement := U'<' XMLName BEFORE ContainerEnd;
   PATTERN XMLEmpty := U'<' XMLName BEFORE EmptyEnd;
+  PATTERN expr2 := PATTERN(U'[a-zA-Z]+[.][a-zA-Z]+[.][a-zA-Z]*[.]*[a-zA-Z]*');
+  PATTERN expr3 := PATTERN(U'[a-zA-Z]+[.][a-zA-Z]+');
 
-  RULE myRule := XMLDecl OR XMLComment OR XMLElement OR XMLEmpty OR
+  PATTERN alpha := PATTERN('[A-Za-z]+');
+  PATTERN ws := [' ']*;
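+  //What the new patterns match (illustrative examples, not from the patch):
+  //  expr2: dotted initialisms with two or more dots, e.g. 'U.S.A' or 'U.S.A.F'
+  //  expr3: two-part dotted initialisms, e.g. 'U.S'
+  //  alpha/ws: a plain word and optional spaces, used in myRule below to join
+  //  a word to a following initialism.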
+  RULE myRule := expr2 ws OR alpha expr2 OR expr3 OR
+                 WordAllLower OR WordAllUpper OR WordTitleCase OR WordMixedCase OR
+                 XMLDecl OR XMLComment OR XMLElement OR XMLEmpty OR
                  AttributeExpr OR EndElement OR TagEndSeq OR
-                 WordAlphaNum OR WhiteSpace OR PoundCode OR
-                 SymbolChar OR Noise OR AnyChar OR AnyPair;
+                 WordAlphaNum OR WhiteSpace OR PoundCode OR
+                 SymbolChar OR Noise OR AnyChar OR AnyPair OR WordNoLetters;
 
   RawPosting parseString(Document doc) := TRANSFORM
     SELF.id := doc.id;;
@@ -58,13 +75,32 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
                  MATCHED(WordAlphaNum) => MATCHLENGTH(MyRule),
                  MATCHED(AnyChar) => MATCHLENGTH(MyRule),
                  MATCHED(AnyPair) => MATCHLENGTH(MyRule),
+                 MATCHED(expr2) => MATCHLENGTH(MyRule),
+                 MATCHED(expr3) => MATCHLENGTH(MyRule),
+                 MATCHED(WordAllUpper) => MATCHLENGTH(MyRule),
+                 MATCHED(WordAllLower) => MATCHLENGTH(MyRule),
+                 MATCHED(WordMixedCase) => MATCHLENGTH(MyRule),
+                 MATCHED(WordNoLetters) => MATCHLENGTH(MyRule),
+                 MATCHED(WordTitleCase) => MATCHLENGTH(MyRule),
                  0);
     SELF.keywords := MAP(
                  MATCHED(SymbolChar) => 1,
                  MATCHED(WordAlphaNum) => 1,
                  MATCHED(AnyChar) => 1,
                  MATCHED(AnyPair) => 1,
-                 0);
+                 //keyword count for a dotted initialism is its letters only: match length minus the dots
+                 MATCHED(expr2) => MATCHLENGTH(expr2) - STD.Str.FindCount((STRING)MATCHTEXT(expr2), '.'),
+                 MATCHED(expr3) => MATCHLENGTH(expr3) - STD.Str.FindCount((STRING)MATCHTEXT(expr3), '.'),
+                 MATCHED(WordAllUpper) => 1,
+                 MATCHED(WordAllLower) => 1,
+                 MATCHED(WordMixedCase) => 1,
+                 MATCHED(WordTitleCase) => 1,
+                 MATCHED(WordNoLetters) => 1,
+                 0);
     SELF.typTerm := MAP(
                  MATCHED(WhiteSpace) => Types.TermType.WhiteSpace,
                  MATCHED(SymbolChar) => Types.TermType.SymbolChar,
@@ -80,7 +116,15 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
                  MATCHED(EndElement) => Types.TermType.Tag,
                  MATCHED(TagEndSeq) => Types.TermType.Tag,
                  MATCHED(PoundCode) => Types.TermType.TextStr,
-                 Types.TermType.Unknown);
+                 MATCHED(expr2) => Types.TermType.AcroStr, //dotted initialisms get the new AcroStr term type
+                 MATCHED(expr3) => Types.TermType.AcroStr,
+                 MATCHED(WordAllUpper) => Types.TermType.TextStr,
+                 MATCHED(WordAllLower) => Types.TermType.TextStr,
+                 MATCHED(WordMixedCase) => Types.TermType.TextStr,
+                 MATCHED(WordTitleCase) => Types.TermType.TextStr,
+                 MATCHED(WordNoLetters) => Types.TermType.SymbolChar,
+                 Types.TermType.Unknown);
     SELF.typData := MAP(
                  MATCHED(WhiteSpace) => Types.DataType.RawData,
                  MATCHED(SymbolChar) => Types.DataType.RawData,
@@ -97,8 +141,17 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
                  MATCHED(EndElement) => Types.DataType.EndElement,
                  MATCHED(TagEndSeq) => Types.DataType.TagEndSeq,
                  MATCHED(PoundCode) => Types.DataType.RawData,
+                 MATCHED(expr2) => Types.DataType.RawData,
+                 MATCHED(expr3) => Types.DataType.RawData,
+                 MATCHED(WordAllUpper) => Types.DataType.RawData,
+                 MATCHED(WordAllLower) => Types.DataType.RawData,
+                 MATCHED(WordMixedCase) => Types.DataType.RawData,
+                 MATCHED(WordTitleCase) => Types.DataType.RawData,
+                 MATCHED(WordNoLetters) => Types.DataType.RawData,
                  Types.DataType.Unknown);
-    SELF.tagValue := MAP(
+    SELF.tagValue := MAP(
                  NOT MATCHED(AttributeExpr) => U'',
                  MATCHED(QuotValueWrap) => MATCHUNICODE(AnyNoQuoteStr),
                  MATCHED(AposValueWrap) => MATCHUNICODE(AnyNoAposStr),
@@ -113,10 +166,23 @@ EXPORT DATASET(RawPosting) ParsedText(DATASET(Document) docsInput) := FUNCTION
    SELF.preorder := 0;
    SELF.parentOrd :=
0;
    SELF.parentName:= U'';
-    SELF.lp := Types.LetterPattern.Unknown;
+    SELF.lp := MAP(
+                 MATCHED(WordAllUpper) => Types.LetterPattern.UpperCase,
+                 MATCHED(WordAllLower) => Types.LetterPattern.LowerCase,
+                 MATCHED(WordMixedCase) => Types.LetterPattern.MixedCase,
+                 MATCHED(WordNoLetters) => Types.LetterPattern.NoLetters,
+                 MATCHED(WordTitleCase) => Types.LetterPattern.TitleCase,
+                 Types.LetterPattern.Unknown);
+    SELF.term := MATCHUNICODE(MyRule);
   END;
   p0 := PARSE(docsInput, content, myRule, parseString(LEFT), MAX, MANY, NOT MATCHED);
-  p1 := ASSERT(p0, typTerm<>Types.TermType.Unknown, Constants.OtherCharsInText_Msg);
-  RETURN p1(typTerm <> Types.TermType.WhiteSpace);
+  //p1 := ASSERT(p0, typTerm<>Types.TermType.Unknown, Constants.OtherCharsInText_Msg); //disabled: the expanded rule now classifies every token
+  RETURN p0(typTerm <> Types.TermType.WhiteSpace);
 END;
\ No newline at end of file
diff --git a/TextSearch/Inverted/RawPostings.ecl b/TextSearch/Inverted/RawPostings.ecl
index 0ab9c1c..89b9fc9 100644
--- a/TextSearch/Inverted/RawPostings.ecl
+++ b/TextSearch/Inverted/RawPostings.ecl
@@ -1,4 +1,4 @@
-//Convert raw content into posting records
+//Convert raw content into posting records
 IMPORT TextSearch.Common;
 IMPORT TextSearch.Common.Types;
 IMPORT TextSearch.Inverted;
@@ -30,6 +30,7 @@ EXPORT GROUPED DATASET(Posting) RawPostings(DATASET(Document) docIn) := FUNCTION
     Types.TermLength lenText;
     Types.KWP keywords;
     Types.Ordinal preorder;
+
   END;
   StateRec := RECORD
     Types.Depth currDepth;
@@ -49,6 +50,7 @@ EXPORT GROUPED DATASET(Posting) RawPostings(DATASET(Document) docIn) := FUNCTION
     SELF.preorder := IF(docChanged, 0, st.lastOrd) + 1;
     SELF.lenText := st.lenText;
     SELF.keywords := st.keywords;
+
   END;
   StateRec initState() := TRANSFORM
     SELF.lastOrd := 0;
@@ -107,6 +109,7 @@ EXPORT GROUPED DATASET(Posting) RawPostings(DATASET(Document) docIn) := FUNCTION
     incrOrdinal := IF(isElement(posting.typData), 1, 0);
     closeElement := posting.typData=DataType.EndElement;
     SELF.kwp := IF(docChanged, 1, st.nextKWP);
+
     SELF.depth := IF(closeElement, st.currDepth-1, st.currDepth);
     SELF.parentOrd := toppreord;
     SELF.preorder := IF(docChanged, 0, st.lastOrd) + incrOrdinal;
@@ -114,10 +117,11 @@ EXPORT GROUPED DATASET(Posting) RawPostings(DATASET(Document) docIn) := FUNCTION
     SELF.parentName := topParentName;
     SELF.lenText := IF(closeElement, st.lenText, posting.lenText);
     SELF.keywords := IF(closeElement, st.keywords, posting.keywords);
+    SELF.typData := IF(SELF.depth>0 AND posting.typData = Types.DataType.RawData,
+                       Types.DataType.PCDATA, posting.typData); //raw text nested in an element becomes PCDATA; other types pass through
     SELF := posting;
   END;
   initalV := ROW(initState());
   p2 := PROCESS(p1, initalV, assign(LEFT,RIGHT), next(LEFT,RIGHT), LOCAL);
   p3 := GROUP(p2, id) : ONWARNING(1037, IGNORE);
   RETURN p3;
-END;
+END;
\ No newline at end of file
diff --git a/TextSearch/Inverted/SpecialPostings.ecl b/TextSearch/Inverted/SpecialPostings.ecl
index ca2ef7d..104d08a 100644
--- a/TextSearch/Inverted/SpecialPostings.ecl
+++ b/TextSearch/Inverted/SpecialPostings.ecl
@@ -1,4 +1,4 @@
-// Make the special posting records.
+// Make the special posting records.
// Right now, the only special records are the document records for //the universal document set operation EXPORT SpecialPostings(DATASET(Layouts.Posting) inp) := FUNCTION diff --git a/TextSearch/Inverted/check.ecl b/TextSearch/Inverted/check.ecl new file mode 100644 index 0000000..3724229 --- /dev/null +++ b/TextSearch/Inverted/check.ecl @@ -0,0 +1,54 @@ +//EXPORT check := 'todo'; +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +IMPORT STD; + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; +END; + + + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init := lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +inDocs := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +ds1 := PROJECT(inDocs, cvt(LEFT)); +OUTPUT(ENTH(ds1, 20), NAMED('Sample_20'));//will print only 20 records + +info := Common.FileName_Info_Instance(stem, instance); + +enumDocs := Inverted.EnumeratedDocs(info, ds1); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); +OUTPUT(CHOOSEN(p1,30)); + + + + + + + + + + + + + diff --git a/TextSearch/Inverted/check2.ecl b/TextSearch/Inverted/check2.ecl new file mode 100644 index 0000000..94472c2 --- /dev/null +++ b/TextSearch/Inverted/check2.ecl @@ -0,0 +1,90 @@ +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +IMPORT STD; +#option('outputLimit',100); + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + set of String init; + // string init_w_pun; +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=[]; +// SELF.init_w_pun:=[]; +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; +expr:='[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); +//OUTPUT(ENTH(inDocs, 20), NAMED('Sample_20'));//will print only 20 records + +//prefix := '~thor::jdh::'; +//inputName := prefix + 'corrected_lda_ap_txtt_xml'; +//stem := prefix + 'corrected_lda_ap_txtt_xml'; +//instance := 'initial2'; + + + +//inDocs := DATASET(inputName, Inverted.Layouts.DocumentIngest, THOR); +OUTPUT(ENTH(inDocs, 20), NAMED('Sample_20'));//will print only 20 records +info := Common.FileName_Info_Instance(stem, instance); + +enumDocs := Inverted.EnumeratedDocs(info, inDocs); +rawPostings := Inverted.RawPostings(enumDocs); +OUTPUT(CHOOSEN(rawPostings,300), ALL, NAMED('First_300')); +selPostings := rawPostings(id=1 AND (start<100 OR start>3400)); +OUTPUT(selPostings, NAMED('Select_Doc_1')); +/* +t_len := TABLE(enumDocs, {id, INTEGER len:=LENGTH(CONTENT)}, id, LOCAL); +p_tab := 
TABLE(rawPostings, + {id, depth, + INTEGER kwds:=SUM(GROUP,keywords), INTEGER sum_lengths:=SUM(GROUP,lenText), + INTEGER min_kwp:=MIN(GROUP,kwp), INTEGER max_kwp:=MAX(GROUP,kwp), + INTEGER end_pos:=MAX(GROUP,stop)}, + id, depth, LOCAL); +pl_tab := JOIN(p_tab, t_len, LEFT.id=RIGHT.id, LOCAL); +OUTPUT(TOPN(pl_tab, 100, id, depth), NAMED('SUMMARY_100')); +*/ + integer i:=0; +t:=TABLE(rawPostings, + {id, term, + // INTEGER kwds:=SUM(GROUP,keywords), INTEGER sum_lengths:=SUM(GROUP,lenText), + //INTEGER min_kwp:=MIN(GROUP,kwp), INTEGER max_kwp:=MAX(GROUP,kwp), + //INTEGER end_pos:=MAX(GROUP,stop) + //String t:=term='.'; + + unicode t:=if(term='.' ,term+term[8],''); + //String t:=if(term='.' and term[2] !='',term+term[2],'') + // i:=i+1; + }, + id, term, LOCAL); + + output(t); + + + + + + + + + + + diff --git a/TextSearch/Inverted/check3.ecl b/TextSearch/Inverted/check3.ecl new file mode 100644 index 0000000..7a9b6fd --- /dev/null +++ b/TextSearch/Inverted/check3.ecl @@ -0,0 +1,186 @@ +//EXPORT try2 := 'todo'; + +//EXPORT solution := 'todo'; + +//EXPORT check := 'todo'; +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +IMPORT STD; +IMPORT TextSearch.Inverted.Layouts; + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); +OUTPUT(ENTH(inDocs, 20), NAMED('Sample_20'));//will print only 20 records + +info := Common.FileName_Info_Instance(stem, instance); + +/////////////////////////////////// +expr:=U'[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*'; + + +cont:= RECORD + string term; + //inDocs.init_w_pun; +//set of string x; +END;; +Inverted.Layouts.DocumentIngest filter(Inverted.Layouts.DocumentIngest doc) := TRANSFORM +//init:=REGEXFINDSET( expr,(string)doc.content); +//SELF.content:=doc.content; +//SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.content, '.')); +SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.'));//+REGEXFINDSET(expr,doc.content); + + +//SELF.init_w_pun:=STD.Str.FilterOut((string)SELF.init, '.'); +//self.init:=STD.Str.FilterOut(REGEXFINDSET( expr,(string)doc.content), '.'); +//to change the field must use self.field +//add new column in data set and search in both +//output(init); +SELF := doc; +END; +s:= PROJECT(inDocs, filter(LEFT)); +OUTPUT(ENTH(s, 20), NAMED('Sample_200'));//will print only 20 records + +//output(s); +//output(REGEXFINDSET(expr,inDocs[1].content)); + +//////////////////////////////////// + + + + + +output(s[1].init,NAMED('Sin')); +enumDocs := Inverted.EnumeratedDocs(info, s); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + +output(rawPostings[1]); + + +OUTPUT(inDocs,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE); +OUTPUT(p1,,'~ONLINE::Farah::OUT::Solution2',OVERWRITE); +OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); + 
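+//Sketch (assumption, not in the original job): the REGEXREPLACE in filter()
+//above substitutes its replacement string for each dotted-initialism match; to
+//strip dots per match instead, collect the matches and filter each one. The
+//names hits and cleaned are hypothetical:
+//  hits := REGEXFINDSET(expr, s[1].init);
+//  cleaned := PROJECT(DATASET(hits, {UNICODE w}),
+//                     TRANSFORM({UNICODE w}, SELF.w := STD.Uni.FilterOut(LEFT.w, '.')));
+//  OUTPUT(cleaned, NAMED('Initialisms_no_dots'));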
+OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution4',OVERWRITE); + + + + + + + + + + + + + +//OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); + + +/* +initialism:=REGEXFINDSET(expr,(string)inDocs[1].content); +output(initialism); +A :=STD.Str.FilterOut(initialism[1], '.'); +output(A); +*/ +/* +cont filters(Inverted.RawPostings doc) := TRANSFORM + + + SELF.term:=''; +SELF := doc; +END; +r:= PROJECT(inDocs, filters(LEFT)); +output(r); + + */ + + + + + + + + + + + + + + + + + + + + + + + + + +///////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + + + + + + + + + + + + + OUTPUT(CHOOSEN(rawPostings,300), ALL, NAMED('First_300')); +selPostings := rawPostings(id=1 AND (start<100 OR start>3400)); +OUTPUT(selPostings, NAMED('Select_Doc_1')); + +t_len := TABLE(enumDocs, {id, INTEGER len:=LENGTH(init)}, id, LOCAL); +p_tab := TABLE(rawPostings, + {id, depth, + INTEGER kwds:=SUM(GROUP,keywords), INTEGER sum_lengths:=SUM(GROUP,lenText), + INTEGER min_kwp:=MIN(GROUP,kwp), INTEGER max_kwp:=MAX(GROUP,kwp), + INTEGER end_pos:=MAX(GROUP,stop)}, + id, depth, LOCAL); +pl_tab := JOIN(p_tab, t_len, LEFT.id=RIGHT.id, LOCAL); +OUTPUT(TOPN(pl_tab, 100, id, depth), NAMED('SUMMARY_100')); + + + + + + diff --git a/TextSearch/Inverted/initialism.ecl b/TextSearch/Inverted/initialism.ecl new file mode 100644 index 0000000..c0019f9 --- /dev/null +++ b/TextSearch/Inverted/initialism.ecl @@ -0,0 +1,108 @@ + +IMPORT TextSearch2.Inverted; +IMPORT TextSearch2.Common; +IMPORT STD; +IMPORT TextSearch2.Inverted.Layouts; + + +#option('outputLimit',100); + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); + +info := Common.FileName_Info_Instance(stem, instance); + + +expr:=U'[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; +expr2:='[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; + + + + + +enumDocs:= Inverted.EnumeratedDocs(info, inDocs); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + +OUTPUT(rawPostings); + + +ValRec := RECORD + unicode val; +END; +DNrec := RECORD + RawPostings ; + DATASET(ValRec) Values; +END; + +DNrec filter(rawPostings L) := TRANSFORM + SetStrVals := REGEXFINDSET(expr2,(STRING)L.term)+Std.Str.SplitWords((STRING)L.term,'.'); + ValuesDS := DATASET(SetStrVals,{STRING StrVal}); + SELF.Values := PROJECT(ValuesDS, + TRANSFORM(ValRec, + SELF.val := (unicode)Left.StrVal)); + SELF:=l; + + + +END; +NestedDS := PROJECT(rawPostings,filter(LEFT)); +NestedDS; + +OutRec := RECORD + RawPostings; + unicode val; + +END; + + + +res:=NORMALIZE(NestedDS,COUNT(LEFT.Values), + TRANSFORM(OutRec, + SELF.val := 
LEFT.Values[COUNTER].val,Self.term:=LEFT.Values[COUNTER].val,SELF.len:=length(LEFT.Values[COUNTER].val),SELF.kwp:=LEFT.kwp+COUNTER,SELF.keywords:=if(length(LEFT.Values[COUNTER].val)=1,1,LEFT.keywords) + ,SELF.lentext:=length(LEFT.Values[COUNTER].val),SELF.typterm:=if(length(LEFT.Values[COUNTER].val)=1,1,LEFT.typterm)/*,SELF.lp:=if(LEFT.lp=0,,LEFT.lp)*/; + SELF := LEFT, + )); + +output(res); + + +PATTERN expr3 :=PATTERN('[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*'); +PATTERN expr4 :=PATTERN('[a-zA-Z][.][a-zA-Z]*'); +PATTERN expr5 :=PATTERN('[a-zA-Z]+'); + +TOKEN JustAWord := expr3 expr5; +RULE NounPhraseComp1 := JustAWord ; +ps1 := { + +out1 := MATCHTEXT(NounPhraseComp1) }; +p14 := PARSE(res, val, NounPhraseComp1, ps1, BEST,MANY,NOCASE); +output(p14,NAMED('Result_4')); + \ No newline at end of file diff --git a/TextSearch/Inverted/john1.ecl b/TextSearch/Inverted/john1.ecl new file mode 100644 index 0000000..efad4ed --- /dev/null +++ b/TextSearch/Inverted/john1.ecl @@ -0,0 +1,84 @@ + + +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +IMPORT STD; +IMPORT TextSearch.Inverted.Layouts; + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); +OUTPUT(ENTH(inDocs, 20), NAMED('Sample_20')); + +info := Common.FileName_Info_Instance(stem, instance); + + +expr:=U'[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*'; + + +cont:= RECORD + string term; +END;; +Inverted.Layouts.DocumentIngest filter(Inverted.Layouts.DocumentIngest doc) := TRANSFORM +SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.')); + +SELF := doc; +END; +s:= PROJECT(inDocs, filter(LEFT)); +OUTPUT(ENTH(s, 20), NAMED('Sample_200')); +output(s[1].init,NAMED('Sin')); +output(inDocs[1].content,NAMED('Con')); + +enumDocs := Inverted.EnumeratedDocs(info, s); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + +output(rawPostings[1]); + + +OUTPUT(inDocs,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE); +OUTPUT(p1,,'~ONLINE::Farah::OUT::Solution2',OVERWRITE); +OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); + +OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution4',OVERWRITE); + +OUTPUT(CHOOSEN(rawPostings,300), ALL, NAMED('First_300')); +selPostings := rawPostings(id=1 AND (start<100 OR start>3400)); +OUTPUT(selPostings, NAMED('Select_Doc_1')); + +t_len := TABLE(enumDocs, {id, INTEGER len:=LENGTH(init)}, id, LOCAL); +p_tab := TABLE(rawPostings, + {id, depth, + INTEGER kwds:=SUM(GROUP,keywords), INTEGER sum_lengths:=SUM(GROUP,lenText), + INTEGER min_kwp:=MIN(GROUP,kwp), INTEGER max_kwp:=MAX(GROUP,kwp), + INTEGER end_pos:=MAX(GROUP,stop)}, + id, depth, LOCAL); +pl_tab := JOIN(p_tab, t_len, LEFT.id=RIGHT.id, LOCAL); +OUTPUT(TOPN(pl_tab, 100, id, depth), NAMED('SUMMARY_100')); diff --git a/TextSearch/Inverted/john2.ecl 
b/TextSearch/Inverted/john2.ecl new file mode 100644 index 0000000..43c2d18 --- /dev/null +++ b/TextSearch/Inverted/john2.ecl @@ -0,0 +1,46 @@ +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +Import STD; + + prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); + +Work2 := RECORD + Common.Types.DocIdentifier doc_ident; + UNSIGNED4 start; + UNICODE content; +END; + +Work2 splitContent(Inverted.Layouts.DocumentIngest inp, UNSIGNED sub) := TRANSFORM + SELF.doc_ident := inp.identifier; + SELF.start := ((sub-1)*100) + 1; + SELF.content := inp.content[SELF.start..SELF.start+99]; +END; + +inParts := NORMALIZE(inDocs, ((LENGTH(LEFT.content)-1)/100)+1, splitContent(LEFT, COUNTER)); + +OUTPUT(CHOOSEN(inParts, 200), ALL, NAMED('First_200_blocks')); diff --git a/TextSearch/Inverted/moby.ecl b/TextSearch/Inverted/moby.ecl new file mode 100644 index 0000000..daf00f0 --- /dev/null +++ b/TextSearch/Inverted/moby.ecl @@ -0,0 +1,33 @@ +//EXPORT moby := 'todo'; + +#option('outputLimit',100); + +import std; +CSVRecord := RECORD + string word; + +END; + + file3 := DATASET('~thor::jdh::moby', + CSVrecord, + CSV(HEADING(1), + SEPARATOR([',']), + TERMINATOR(['\n']))); + +file3; + + +cont:= RECORD + + unicode term; + set of unicode synonyms; +END; +cont filter(file3 doc) := TRANSFORM + +SELF.term:=STD.STr.SplitWords(doc.word,',')[1]; //I've got all words +SELF.synonyms:=STD.STr.SplitWords(doc.word,',')[2..];// to return set of synonyms + +SELF := doc; +END; +s:= PROJECT(file3, filter(LEFT)); +output(s); diff --git a/TextSearch/Inverted/states.ecl b/TextSearch/Inverted/states.ecl new file mode 100644 index 0000000..3fcd815 --- /dev/null +++ b/TextSearch/Inverted/states.ecl @@ -0,0 +1,150 @@ + +IMPORT TextSearch2.Inverted; +IMPORT TextSearch2.Common; +IMPORT STD; +IMPORT TextSearch2.Inverted.Layouts; +Import python; + +#option('outputLimit',100); + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); + +info := Common.FileName_Info_Instance(stem, instance); + + + +enumDocs := Inverted.EnumeratedDocs(info, inDocs); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + 
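+//Sketch (illustrative, not in the original job): the DICTIONARY definitions
+//below give constant-time membership tests used to tag postings that are
+//state codes or names, e.g.
+//  DsDCT := DICTIONARY(Ds, {code => Ds});
+//  U'TX' IN DsDCT;  //TRUE for a state code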
+OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE); + + +rec := RECORD + UNICODE code; + UNICODE state; +END; +Ds := DATASET([{'AK', 'Alaska'}, + {'AL', 'Alabama'}, + {'AR', 'Arkansas'}, + {'AS', 'American Samoa'}, + {'AZ', 'Arizona'}, + {'CA', 'California'}, + {'CO', 'Colorado'}, + {'CT', 'Connecticut'}, + {'DC', 'District of Columbia'}, + {'DE', 'Delaware'}, + {'FL', 'Florida'}, + {'GA', 'Georgia'}, + {'GU', 'Guam'}, + {'HI', 'Hawaii'}, + {'IA', 'Iowa'}, + {'ID', 'Idaho'}, + {'IL', 'Illinois'}, + {'IN', 'Indiana'}, + {'KS', 'Kansas'}, + {'KY', 'Kentucky'}, + {'LA', 'Louisiana'}, + {'MA', 'Massachusetts'}, + {'MD', 'Maryland'}, + {'ME', 'Maine'}, + {'MI', 'Michigan'}, + {'MN', 'Minnesota'}, + {'MO', 'Missouri'}, + {'MP', 'Northern Mariana Islands'}, + {'MS', 'Mississippi'}, + {'MT', 'Montana'}, + {'NA', 'National'}, + {'NC', 'North Carolina'}, + {'ND', 'North Dakota'}, + {'NE', 'Nebraska'}, + {'NH', 'New Hampshire'}, + {'NJ', 'New Jersey'}, + {'NM', 'New Mexico'}, + {'NV', 'Nevada'}, + {'NY', 'New York'}, + {'OH', 'Ohio'}, + {'OK', 'Oklahoma'}, + {'OR', 'Oregon'}, + {'PA', 'Pennsylvania'}, + {'PR', 'Puerto Rico'}, + {'RI', 'Rhode Island'}, + {'SC', 'South Carolina'}, + {'SD', 'South Dakota'}, + {'TN', 'Tennessee'}, + {'TX', 'Texas'}, + {'UT', 'Utah'}, + {'VA', 'Virginia'}, + {'VI', 'Virgin Islands'}, + {'VT', 'Vermont'}, + {'WA', 'Washington'}, + {'WI', 'Wisconsin'}, + {'WV', 'West Virginia'}, + {'WY', 'Wyoming'}],rec); + + + +DsDCT := DICTIONARY(DS,{code => DS}); +DsDCT2 := DICTIONARY(DS,{state => DS}); + + +OUTPUT(rawPostings[0].term IN DsDCT2); + +cont:= RECORD + + rawPostings.term; + +END;; +cont filter(Inverted.Layouts.RawPosting doc) := TRANSFORM + +SELF.term:=if(doc.term IN DsDCT or doc.term IN DsDCT2,doc.term,'');; + +SELF := doc; +END; +s:= PROJECT(rawPostings, filter(LEFT)); +output(s); + + +ValRec := RECORD + unicode val; +END; +DNrec := RECORD + RawPostings ; + +END; + +DNrec filter3(rawPostings L) := TRANSFORM + unicode t:=L.term; + SELF.term:=if(L.term IN DsDCT or L.term IN DsDCT2,t,L.term);; + SELF:=l; + +END; +NestedDS := PROJECT(rawPostings,filter3(LEFT)); +output(NestedDS) diff --git a/TextSearch/Inverted/test_moby.ecl b/TextSearch/Inverted/test_moby.ecl new file mode 100644 index 0000000..06bf8f6 --- /dev/null +++ b/TextSearch/Inverted/test_moby.ecl @@ -0,0 +1,52 @@ +#option('outputLimit',100); + +import std; +CSVRecord := RECORD + string word; + +END; + + file3 := DATASET('~thor::jdh::moby', + CSVrecord, + CSV(HEADING(1), + SEPARATOR([',']), + TERMINATOR(['\n']))); + +file3; + + +cont:= RECORD + + unicode term; + set of unicode synonyms; +END; +cont filter(file3 doc) := TRANSFORM + +SELF.term:=STD.STr.SplitWords(doc.word,',')[1]; //I've got all words +SELF.synonyms:=STD.STr.SplitWords(doc.word,',')[2..];// to return set of synonyms + +SELF := doc; +END; +s:= PROJECT(file3, filter(LEFT)); +//output(s); + +unicode t:='Abaddon'; +output(s); +//res:=if(s[0]=t,s[1],[]); +//output(res) + + +cont2 := RECORD + unicode term; + set of unicode synonoms; + +END; +cont2 filter2(file3 doc) := TRANSFORM + +SELF.term:=if(STD.STr.SplitWords(doc.word,',')[1]=t,STD.STr.SplitWords(doc.word,',')[1],''); + +SELF.synonoms:=if(STD.STr.SplitWords(doc.word,',')[1]=t,STD.STr.SplitWords(doc.word,',')[2..],[]); //I've got all words + +END; +s2:= PROJECT(file3, filter2(LEFT)); + output(s2); \ No newline at end of file diff --git a/TextSearch/Inverted/try2.ecl b/TextSearch/Inverted/try2.ecl new file mode 100644 index 0000000..0f93474 --- /dev/null +++ b/TextSearch/Inverted/try2.ecl @@ 
-0,0 +1,133 @@ +//EXPORT try2 := 'todo'; + +//EXPORT solution := 'todo'; + +//EXPORT check := 'todo'; +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +IMPORT STD; +IMPORT TextSearch.Inverted.Layouts; + +#option('outputLimit',100); + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); +OUTPUT(ENTH(inDocs, 20), NAMED('Sample_20'));//will print only 20 records + +info := Common.FileName_Info_Instance(stem, instance); + +/////////////////////////////////// +expr:=U'[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*'; + + +cont:= RECORD + string term; + //inDocs.init_w_pun; +//set of string x; +END;; +Inverted.Layouts.DocumentIngest filter(Inverted.Layouts.DocumentIngest doc) := TRANSFORM + +//SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.'));//+REGEXFINDSET(expr,doc.content); + +SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.')); + + + + +SELF := doc; +END; +s:= PROJECT(inDocs, filter(LEFT)); +//OUTPUT(ENTH(s, 20),,'~tests' ,NAMED('Sample_200'));//will print only 20 records + +//output(s); +//output(REGEXFINDSET(expr,inDocs[1].content)); + +//////////////////////////////////// + + + + + + +output(inDocs[1].content,NAMED('Before_init')); +output(s[1].init,NAMED('After_init')); + +enumDocs := Inverted.EnumeratedDocs(info, s); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + + + +//OUTPUT(inDocs,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE); +//OUTPUT(p1,,'~ONLINE::Farah::OUT::Solution2',OVERWRITE); +//OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); + +//OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution4',OVERWRITE); + + + +//OUTPUT(ENTH(rawPostings[1]), NAMED('Posting'));//will print only 20 records + + +output(rawPostings,NAMED('Posting')); +output(p1,NAMED('parsed')); + + + + + + + + + +//OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); + + +/* +initialism:=REGEXFINDSET(expr,(string)inDocs[1].content); +output(initialism); +A :=STD.Str.FilterOut(initialism[1], '.'); +output(A); +*/ +/* +cont filters(Inverted.RawPostings doc) := TRANSFORM + + + SELF.term:=''; +SELF := doc; +END; +r:= PROJECT(inDocs, filters(LEFT)); +output(r); + + */ + + + \ No newline at end of file diff --git a/TextSearch/Inverted/try4.ecl b/TextSearch/Inverted/try4.ecl new file mode 100644 index 0000000..cbd5d33 --- /dev/null +++ b/TextSearch/Inverted/try4.ecl @@ -0,0 +1,247 @@ +//EXPORT try2 := 'todo'; + +//EXPORT solution := 'todo'; + +//EXPORT check := 'todo'; +IMPORT TextSearch.Inverted; +IMPORT TextSearch.Common; +IMPORT STD; +IMPORT TextSearch.Inverted.Layouts; + + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + 
UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); +//OUTPUT(ENTH(inDocs, 20), NAMED('Sample_20'));//will print only 20 records + +info := Common.FileName_Info_Instance(stem, instance); + +/////////////////////////////////// +expr:=U'[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; + + + +Inverted.Layouts.RawPosting filter(Inverted.Layouts.RawPosting doc) := TRANSFORM + +//SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.'));//+REGEXFINDSET(expr,doc.content); + +SELF.term:=REGEXREPLACE( expr,doc.term,STD.Uni.FilterOut(doc.term, '.')); + +SELF := doc; +END; + +//OUTPUT(ENTH(s, 20),,'~tests' ,NAMED('Sample_200'));//will print only 20 records + +//output(s); +//output(REGEXFINDSET(expr,inDocs[1].content)); + +//////////////////////////////////// + +Inverted.Layouts.RawPosting filter2(Inverted.Layouts.RawPosting doc) := TRANSFORM + +//SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.'));//+REGEXFINDSET(expr,doc.content); + +SELF.term:=STD.Uni.FindReplace(doc.term,'.','\n'); + + + +SELF := doc; +END; + + + +Inverted.Layouts.DocumentIngest filter3(Inverted.Layouts.DocumentIngest doc) := TRANSFORM + +//SELF.init:=REGEXREPLACE( expr,doc.content,STD.Uni.FilterOut(doc.init, '.'));//+REGEXFINDSET(expr,doc.content); + +SELF.content:=STD.Uni.FindReplace(doc.content,'.',' '); + + + + +SELF := doc; +END; + + +//output(inDocs[1].content,NAMED('Before_init')); +//output(s[1].init,NAMED('After_init')); + +enumDocs := Inverted.EnumeratedDocs(info, inDocs); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + +s:= PROJECT(rawPostings, filter(LEFT)); +s2:=PROJECT(rawPostings, filter2(LEFT)); + + + +//output(s); + +//OUTPUT(inDocs,,'~ONLINE::Farah::OUT::Solution1',OVERWRITE); +OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); +OUTPUT(s,,'~ONLINE::Farah::OUT::Solution2',OVERWRITE); +OUTPUT(s2,,'~ONLINE::Farah::OUT::Solution4',OVERWRITE); + + +enum2:=PROJECT(inDocs, filter3(LEFT)); +OUTPUT(enum2[1].content,named('farah')); + +enumDocs2 := Inverted.EnumeratedDocs(info, enum2); +//p11 := Inverted.ParsedText(enumDocs2); +rawPostings2 := Inverted.RawPostings(enumDocs2); +OUTPUT(rawPostings2,,'~ONLINE::Farah::OUT::Solution7',OVERWRITE); + + +//OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution4',OVERWRITE); + + + +//OUTPUT(ENTH(rawPostings[1]), NAMED('Posting'));//will print only 20 records + + +//output(rawPostings,NAMED('Posting')); +//output(p1,NAMED('parsed')); + + + + + + + + + +//OUTPUT(rawPostings,,'~ONLINE::Farah::OUT::Solution3',OVERWRITE); + + +/* +initialism:=REGEXFINDSET(expr,(string)inDocs[1].content); +output(initialism); +A :=STD.Str.FilterOut(initialism[1], '.'); +output(A); +*/ +/* +cont filters(Inverted.RawPostings doc) := TRANSFORM + + + SELF.term:=''; +SELF := doc; +END; +r:= PROJECT(inDocs, filters(LEFT)); +output(r); + + */ +e:=REGEXREPLACE( expr,inDocs[1].content ,STD.Uni.FilterOut(inDocs[1].content, '.')); + +output(e); + + + +ds := DATASET([{'thee is anew A.B.C and 
V.R'}], {STRING100 line}); + + +PATTERN expr2 :=PATTERN(U'[a-zA-Z][.][a-zA-Z]*[.][a-zA-Z]*[.]*[a-zA-Z]*'); + + +PATTERN ws := PATTERN('[ \t\r\n]'); + + + + + +PATTERN Alpha := PATTERN('[A-Za-z]'); + + + +PATTERN Word := Alpha+; + + + +PATTERN Article := ['the', 'A']; + + + +TOKEN JustAWord := expr2 ; + + + +PATTERN notHen := VALIDATE(Word, MATCHTEXT != 'hen'); + + + +TOKEN NoHenWord := notHen ; + + + +RULE NounPhraseComp1 := JustAWord ; + +RULE NounPhraseComp2 := NoHenWord | Article ws Word; +//RULE Noun3 := NounPhraseComp1 , NounPhraseComp2; + + +ps1 := { + + + +out1 := MATCHTEXT(NounPhraseComp1) }; + +ps2 := { + +out2 := MATCHTEXT(NounPhraseComp2) }; + +//ps3 := { + + + +//out3 := MATCHTEXT(Noun3) }; + + + +p11 := PARSE(ds, line, NounPhraseComp1, ps1, BEST,MANY,NOCASE); + +p22 := PARSE(ds, line, NounPhraseComp2, ps2, BEST,MANY,NOCASE); +//p33 := PARSE(ds, line, Noun3, ps3, BEST,MANY,NOCASE); + +output(p11); +output(p22); +//output(p33); + + p111 := PARSE(inDocs, content, NounPhraseComp1, ps1, BEST,MANY,NOCASE); + output(p111); + //pr := Inverted.ParsedText(p111); +//sss:=REGEXREPLACE( expr,p111[1],STD.Uni.FilterOut(doc.init, '.')); +//output(p111); + p222 := PARSE(inDocs, content, NounPhraseComp2, ps2, BEST,MANY,NOCASE); + output(p222); + output(p111+p222); + + + + + \ No newline at end of file diff --git a/TextSearch/Inverted/word2vec_1.ecl b/TextSearch/Inverted/word2vec_1.ecl new file mode 100644 index 0000000..7e6cf49 --- /dev/null +++ b/TextSearch/Inverted/word2vec_1.ecl @@ -0,0 +1,102 @@ + +IMPORT Python; +#option('outputLimit',100); + + +namerec := RECORD + string name; +END; + + + + + +IMPORT TextSearch2.Inverted; +IMPORT TextSearch2.Common; +IMPORT STD; +IMPORT TextSearch2.Inverted.Layouts; + + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); + +info := Common.FileName_Info_Instance(stem, instance); + + +expr:=U'[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; +expr2:='[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; + + + + + +enumDocs := Inverted.EnumeratedDocs(info, inDocs); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + +OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution77',OVERWRITE); + +rec0 := RECORD + unicode cell; +END; + +rec := RECORD +DATASET(rec0) arow; +END; + + + + +import python; +DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A) := embed(Python) + + + import numpy as np + import re + import gensim + + s=[] + for n in A: + s.append(gensim.utils.simple_preprocess(unicode(n.content))) + model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10) + model.train(s,total_examples=len(s),epochs=10) + w1 = "school" + r= model.wv.most_similar(positive=w1) + return r + +endembed; + + + + + + + OUTPUT(CHOOSEN(word2vec(inDocs), 200), ALL, NAMED('First_200_blocks')); diff 
--git a/TextSearch/Inverted/word2vec_2.ecl b/TextSearch/Inverted/word2vec_2.ecl new file mode 100644 index 0000000..a19258e --- /dev/null +++ b/TextSearch/Inverted/word2vec_2.ecl @@ -0,0 +1,108 @@ + +IMPORT Python; +#option('outputLimit',100); + +namerec := RECORD + string name; +END; + + + + + +IMPORT TextSearch2.Inverted; +IMPORT TextSearch2.Common; +IMPORT STD; +IMPORT TextSearch2.Inverted.Layouts; + + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); + +info := Common.FileName_Info_Instance(stem, instance); + + +expr:=U'[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; +expr2:='[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; + + + + + +enumDocs := Inverted.EnumeratedDocs(info, inDocs); +p1 := Inverted.ParsedText(enumDocs); +rawPostings := Inverted.RawPostings(enumDocs); + +OUTPUT(enumDocs,,'~ONLINE::Farah::OUT::Solution77',OVERWRITE); + +rec0 := RECORD + set of unicode cell; +END; + +rec := RECORD +DATASET(rec0) arow; +END; + + + + +import python; +DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A, unicode word) := embed(Python) + + + import numpy as np + import re + import gensim + + s=[] + for n in A: + s.append(gensim.utils.simple_preprocess(unicode(n.content))) + model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10) + model.train(s,total_examples=len(s),epochs=10) + w1 =word.split() + r=[] + for i in w1: + r.append([i,unicode(model.wv.most_similar(positive=(i)))]) + return r + +endembed; + + + + + query:=u'students in school' ; + + +res:=word2vec(inDocs,query); +Output(res); + + + diff --git a/TextSearch/Inverted/word2vec_3.ecl b/TextSearch/Inverted/word2vec_3.ecl new file mode 100644 index 0000000..2616f23 --- /dev/null +++ b/TextSearch/Inverted/word2vec_3.ecl @@ -0,0 +1,144 @@ + +IMPORT Python; +#option('outputLimit',100); + +namerec := RECORD + string name; +END; + + + + + +IMPORT TextSearch2.Inverted; +IMPORT TextSearch2.Common; +IMPORT STD; +IMPORT TextSearch2.Inverted.Layouts; + + + + +prefix := '~thor::jdh::'; +inputName := prefix + 'corrected_lda_ap_txtt_xml'; + +Work1 := RECORD + UNICODE doc_number{XPATH('/DOC/DOCNO')}; + UNICODE content{MAXLENGTH(32000000),XPATH('<>')}; + UNICODE text{MAXLENGTH(32000000),XPATH('/DOC/TEXT')}; + UNSIGNED8 file_pos{VIRTUAL(fileposition)}; + UNICODE init; + +END; + + +Inverted.Layouts.DocumentIngest cvt(Work1 lr) := TRANSFORM + SELF.identifier := TRIM(lr.doc_number, LEFT,RIGHT); + SELF.seqKey := inputName + '-' + INTFORMAT(lr.file_pos,12,1); + SELF.slugLine := lr.text[1..STD.Uni.Find(lr.text,'.',1)+1]; + SELF.content := lr.content; + SELF.init:=lr.content; + +END; + + +stem := prefix + 'corrected_lda_ap_txtt_xml'; +instance := 'initial2'; + +ds0 := DATASET(inputName, Work1, XML('/DOC', NOROOT)); +inDocs := PROJECT(ds0, cvt(LEFT)); + +info := 
Common.FileName_Info_Instance(stem, instance); + + +expr:=U'[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; +expr2:='[a-zA-Z][.][a-zA-Z][.]*[a-zA-Z]*[.]*[a-zA-Z]*'; + + + + + +OUTPUT(inDocs); + + + + +rec0 := RECORD + unicode cell; +END; + +rec := RECORD +DATASET(rec0) arow; +END; + + + + +import python; +DATASET(rec0) word2vec(dataset(Inverted.Layouts.DocumentIngest) A, unicode word) := embed(Python) + + + import numpy as np + import re + import gensim + + s=[] + for n in A: + s.append(gensim.utils.simple_preprocess(unicode(n.content))) + model = gensim.models.Word2Vec(s,size=150,window=10,min_count=2,workers=10) + model.train(s,total_examples=len(s),epochs=10) + w1 =word.split() + r=[] + for i in w1: + r.append([i,unicode(model.wv.most_similar(positive=(i)))]) + + return (r[0][1]).split(',') + +endembed; + + + + + query:=u'students in school' ; + + + +res:=word2vec(inDocs,query); +Output(res); + + +rec2 := RECORD + DATASET (Inverted.Layouts.DocumentIngest) cell; +END; + Dataset(rec2) filter(dataset(Inverted.Layouts.DocumentIngest) A, DATASET (rec0) B) := embed(Python) + + import numpy as np + import re + import gensim + s=[] + r=[] + m=[] + l=[] + + for i in B: + for n in A: + if (unicode (n.content).find(unicode(i.cell))!=0): + if (n.content not in m): + m.append([n.content]) + l.append([n]) + + + + return l +endembed; + +res2:=filter(inDocs,res); +Output(res2); +OUTPUT(CHOOSEN(res2, 100), ALL, NAMED('First_100_blocks')); + + + + + + + + \ No newline at end of file
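
A minimal, self-contained ECL sketch (not part of the patch) of the initialism handling this change introduces in ParsedText.ecl: dotted forms are matched as single terms, and the keyword count excludes the dots, mirroring the MATCHLENGTH minus STD.Str.FindCount scheme above. The names ds, Initialism, and res are hypothetical.

IMPORT Std;
ds := DATASET([{U'The U.S.A. and the U.K. signed it'}], {UNICODE line});
//same character-class style as expr2/expr3 in ParsedText.ecl
PATTERN Initialism := PATTERN(U'[a-zA-Z]+[.][a-zA-Z]+[.]*[a-zA-Z]*[.]*[a-zA-Z]*');
res := RECORD
  UNICODE term := MATCHUNICODE(Initialism);
  UNSIGNED4 kw := MATCHLENGTH(Initialism)
                  - Std.Str.FindCount((STRING)MATCHTEXT(Initialism), '.');
END;
//expected hits: 'U.S.A.' with kw=3 and 'U.K.' with kw=2
OUTPUT(PARSE(ds, line, Initialism, res, MAX, MANY));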