## Morphological disambiguation
### Part-of-Speech Tagger comparison
I've decided to compare three part-of-speech taggers on the UD Bulgarian corpus. First, I used UDPipe, as the task instructs, and achieved these results:
~~~~
Metrics | Precision | Recall | F1 Score | AlignedAcc
-----------+-----------+-----------+-----------+-----------
UPOS | 97.81 | 97.81 | 97.81 | 97.81
~~~~

### Constraint Grammar
I've tried to write rules to disambiguate the example sentence. The only readings I couldn't get rid of were the ones that weren't POS tags, since they have a different structure I couldn't work around.
Here are the rules that worked:

~~~~
DELIMITERS = "." ;

LIST DET = DET ;
LIST PUNCT = PUNCT ;
LIST NOUN = NOUN ;
LIST VERB = VERB ;
LIST PRON = PRON ;
LIST ADP = ADP ;
LIST PART = PART ;

SECTION

REMOVE DET IF (1C PUNCT) ;
REMOVE PRON IF (1 VERB OR NOUN) ;
REMOVE DET IF (-1C ADP) ;
REMOVE PART ;
~~~~
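To make the mechanics of these rules concrete, here is a minimal Python sketch (a toy illustration, not CG-3 itself) of how a rule like `REMOVE DET IF (1C PUNCT)` operates: each token carries a list of candidate readings, and the rule deletes a reading when the right-hand context is unambiguous (`1C` = "the next token is *certainly* of that class").

```python
def remove_if_next(tokens, target_pos, next_pos):
    """REMOVE target_pos IF (1C next_pos): drop a reading when the
    following token is unambiguously next_pos."""
    out = []
    for i, readings in enumerate(tokens):
        nxt = tokens[i + 1] if i + 1 < len(tokens) else []
        next_is_certain = len(nxt) == 1 and nxt[0] == next_pos
        if next_is_certain and len(readings) > 1:
            # never strip the last remaining reading (CG's safety guarantee)
            readings = [r for r in readings if r != target_pos] or readings
        out.append(readings)
    return out

# "том" is PRON/DET ambiguous; the next token is unambiguously PUNCT,
# so REMOVE DET IF (1C PUNCT) keeps only the PRON reading.
sentence = [["ADP"], ["PRON", "DET"], ["PUNCT"]]
print(remove_if_next(sentence, "DET", "PUNCT"))  # → [['ADP'], ['PRON'], ['PUNCT']]
```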

And the ones that didn't:
~~~~
LIST Gen = Gen ;
LIST Acc = Acc ;
LIST Nom = Nom ;

REMOVE Acc IF (-1C NOUN) ;
REMOVE Nom IF (-1C ADP) ;
~~~~
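The case-based rules presumably fail because the case values live inside the feature string rather than in the POS slot. As a toy Python sketch (an assumed representation, not CG-3), giving each reading an explicit Case feature lets the same kind of rule apply:

```python
def remove_case_after(tokens, case, prev_pos):
    """REMOVE <case> IF (-1C <prev_pos>): drop a reading with the given
    case when the preceding token is unambiguously prev_pos."""
    out = []
    for i, readings in enumerate(tokens):
        # -1C checks the already-disambiguated left context
        prev = out[i - 1] if i > 0 else []
        prev_certain = len(prev) == 1 and prev[0]["pos"] == prev_pos
        if prev_certain and len(readings) > 1:
            kept = [r for r in readings if r.get("case") != case]
            readings = kept or readings  # keep at least one reading
        out.append(readings)
    return out

# "работы" after an unambiguous noun: REMOVE Acc IF (-1C NOUN)
tokens = [
    [{"pos": "NOUN", "case": "Nom"}],
    [{"pos": "NOUN", "case": "Gen"}, {"pos": "NOUN", "case": "Acc"}],
]
result = remove_case_after(tokens, "Acc", "NOUN")
print(result[1])  # → [{'pos': 'NOUN', 'case': 'Gen'}]
```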

In the end, I was left with this:
~~~~
"<Однако>"
"однако" ADV Degree=Pos
"<стиль>"
"стиль" NOUN Animacy=Inan Case=Nom Gender=Masc Number=Sing
"стиль" NOUN Animacy=Inan Case=Acc Gender=Masc Number=Sing
"<работы>"
"работа" NOUN Animacy=Inan Case=Gen Gender=Fem Number=Sing
"работа" NOUN Animacy=Inan Case=Nom Gender=Fem Number=Plur
"работа" NOUN Animacy=Inan Case=Acc Gender=Fem Number=Plur
"<Семена>"
"Семен" PROPN Animacy=Anim Case=Gen Gender=Masc Number=Sing
"Семен" PROPN Animacy=Anim Case=Acc Gender=Masc Number=Sing
"<Еремеевича>"
"Еремеевич" PROPN Animacy=Anim Case=Gen Gender=Masc Number=Sing
"<заключался>"
"заключаться" VERB Aspect=Imp Gender=Masc Mood=Ind Number=Sing Tense=Past VerbForm=Fin Voice=Mid
"<в>"
"в" ADP
"<том>"
"то" PRON Animacy=Inan Case=Loc Gender=Neut Number=Sing
; "тот" DET Case=Loc Gender=Neut Number=Sing REMOVE:16
; "тот" DET Case=Loc Gender=Masc Number=Sing REMOVE:16
"<,>"
"," PUNCT
"<чтобы>"
"чтобы" SCONJ Mood=Cnd
"<принимать>"
"принимать" VERB Aspect=Imp VerbForm=Inf Voice=Act
"<всех>"
"весь" DET Case=Gen Number=Plur
"весь" DET Case=Loc Number=Plur
"весь" DET Case=Acc Number=Plur
; "все" PRON Animacy=Anim Case=Acc Number=Plur REMOVE:20
; "все" PRON Animacy=Anim Case=Gen Number=Plur REMOVE:20
"<желающих>"
"желать" VERB Aspect=Imp Case=Gen Number=Plur Tense=Pres VerbForm=Part Voice=Act
"<и>"
"и" CCONJ
; "и" PART REMOVE:26
"<лично>"
"лично" ADV Degree=Pos
"<вникать>"
"*вникать"
"<в>"
"в" ADP
"<дело>"
"дело" NOUN Animacy=Inan Case=Nom Gender=Neut Number=Sing
"дело" NOUN Animacy=Inan Case=Acc Gender=Neut Number=Sing
"<.>"
"." PUNCT
~~~~
### Improving perceptron tagger
Here are the original results of the perceptron tagger on Spanish UD:
~~~~
Metrics | Precision | Recall | F1 Score | AlignedAcc
-----------+-----------+-----------+-----------+-----------
UPOS | 94.72 | 94.72 | 94.72 | 94.72
~~~~
Here, I've managed to slightly improve the perceptron tagger on *UD Spanish* by changing the suffix and pref1 feature parameters.
I changed pref1 to use the first three characters, *add('i pref1', word[0:3])*, and got the following results:
~~~~
Metrics | Precision | Recall | F1 Score | AlignedAcc
-----------+-----------+-----------+-----------+-----------
UPOS | 95.58 | 95.58 | 95.58 | 95.58
~~~~
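For reference, the feature change above can be sketched as follows (modelled on the standard averaged-perceptron tagger's feature function; the names here are approximate, not the exact source):

```python
def get_features(word, prev_tag, prev2_tag):
    """Sparse binary features for one token, as in an averaged-perceptron
    POS tagger (simplified: the real function also uses context words)."""
    features = {}
    def add(name, *args):
        features[' '.join((name,) + args)] = 1
    add('bias')
    add('i suffix', word[-3:])    # last three characters
    add('i pref1', word[0:3])     # changed: first three characters instead of word[0]
    add('i-1 tag', prev_tag)
    add('i-2 tag', prev2_tag)
    add('i word', word)
    return features

feats = get_features('hablando', '-START-', '-START2-')
print('i pref1 hab' in feats)   # → True
print('i suffix ndo' in feats)  # → True
```

The richer prefix feature gives the model a handle on derivational prefixes, which is plausibly why it helps on Spanish.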
Then, I played a bit with the suffix features, changing all of the suffix instances from the last three characters (word[-3:]) to the last two (word[-2:]), which scored slightly lower than the prefix change:
~~~~
Metrics | Precision | Recall | F1 Score | AlignedAcc
-----------+-----------+-----------+-----------+-----------
UPOS | 95.21 | 95.21 | 95.21 | 95.21
~~~~
from io import open
from conllu import parse_incr

def main():
    print('Extract sentences from the file:')
    test_file = str(input())

    print('Write sentences to file:')
    res_file = str(input())

    print('Write gold sentences to file:')
    gold_file = str(input())

    data = open(test_file, 'r', encoding='utf-8')
    all_sent = []
    gold_sent = []

    for tokenlist in parse_incr(data):
        # raw sentence from the '# text = ...' comment line
        all_sent.append(tokenlist.metadata['text'])
        sent = []
        for token in tokenlist:
            sent.append(token['form'])
        gold_sent.append(' '.join(sent))

    res = open(res_file, 'w', encoding='utf-8')
    for line in all_sent:
        res.write(line + '\n')
    res.close()
    print('{} sentences were extracted and written to {}'.format(len(all_sent), res_file))

    gd = open(gold_file, 'w', encoding='utf-8')
    for line in gold_sent:
        gd.write(line + '\n')
    gd.close()
    print('{} sentences were extracted and written to {}'.format(len(gold_sent), gold_file))

if __name__ == '__main__':
    main()
2018-komp-ling/practicals/segmentation-tokenization/create_dict.py
from io import open
from conllu import parse_incr

def main():
    print('Name of conllu file to parse:')
    train = str(input())
    print('Name of dictionary to save:')
    name = str(input())

    all_sent = []
    data_file = open(train, 'r', encoding='utf-8')
    for tokenlist in parse_incr(data_file):
        for token in tokenlist:
            all_sent.append(token['form'])
    data_file.close()

    # unique word forms, longest first
    k = list(set(all_sent))
    k.sort(key=lambda s: len(s), reverse=True)

    diction = open(name, 'w', encoding='utf-8')
    for line in k:
        diction.write(line + '\n')
    diction.close()
    print('Dictionary successfully created. Number of words is {}'.format(len(k)))

if __name__ == '__main__':
    main()
2018-komp-ling/practicals/segmentation-tokenization/evaluation.py
def main():
    print('Insert file name with original segmentation')
    f = str(input())
    print('Insert file name with your segmentation')
    p = str(input())

    all_sent = open(f, 'r', encoding='utf-8').read().splitlines()
    parsed_s = open(p, 'r', encoding='utf-8').read().splitlines()

    tp, tn, fp, fn = 0, 0, 0, 0

    # Compare the two segmentations character by character:
    # a shared space is a true positive, a shared non-space a true negative,
    # a space only in the gold line a false negative, only in ours a false positive.
    for ind in range(min(len(all_sent), len(parsed_s))):
        first = list(all_sent[ind])
        second = list(parsed_s[ind])
        i, j = 0, 0
        while i < len(first) and j < len(second):
            g_s_l = first[i]
            p_s_l = second[j]
            if g_s_l == ' ' and p_s_l == ' ':
                tp += 1
                i += 1
                j += 1
            elif g_s_l == ' ':
                fn += 1
                i += 1
            elif p_s_l == ' ':
                fp += 1
                j += 1
            else:
                tn += 1
                i += 1
                j += 1

    print('TruePositive: {0}, TrueNegative: {1}, FalsePositive: {2}, FalseNegative: {3}'.format(tp, tn, fp, fn))
    acc = (tp + tn) / float(tp + tn + fp + fn)
    prec = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    fscore = 2 * prec * recall / float(prec + recall)
    print('Accuracy: {}, F-score: {}'.format(round(acc, 2), round(fscore, 2)))

if __name__ == '__main__':
    main()
2018-komp-ling/practicals/segmentation-tokenization/maxmatch.py
parsed_sent = []

def maxmatch(sentence, dictionary):
    """Greedy longest-match segmentation: repeatedly take the longest
    dictionary word found at the start of the remaining sentence."""
    global parsed_sent
    if len(sentence) == 0:
        return
    for i in range(len(sentence), 0, -1):
        firstword = sentence[0:i]
        remainder = sentence[i:]
        if firstword in dictionary:
            parsed_sent.append(firstword)
            return maxmatch(remainder, dictionary)
        if i == 1:
            # no dictionary match: emit the first character and continue
            parsed_sent.append(firstword)
            return maxmatch(remainder, dictionary)

def main():
    global parsed_sent
    print('Insert name of dict:')
    n_dict = str(input())
    # a set makes the membership test fast
    used_dict = set(open(n_dict, 'r', encoding='utf-8').read().splitlines())

    print('Sentences to parse:')
    n_test_sent = str(input())
    sentences = open(n_test_sent, 'r', encoding='utf-8').read().splitlines()

    print('Save to:')
    s_file = str(input())

    res = []
    for sent in sentences:
        maxmatch(sent, used_dict)
        res.append(parsed_sent)
        parsed_sent = []

    save = open(s_file, 'w', encoding='utf-8')
    for i in res:
        save.write(' '.join(i) + '\n')
    save.close()

if __name__ == '__main__':
    main()