diff --git a/Makefile b/Makefile index 58cabfd..84fc600 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,9 @@ test: malayalam.a python coverage-analysis: malayalam.a python @python tests/coverage-test.py + sort -u tests/unanalyzed.lex --output=tests/unanalyzed.lex + wc tests/unanalyzed.lex dataset: pip install tqdm - python scripts/create-dataset.py \ No newline at end of file + python scripts/create-dataset.py diff --git a/README.md b/README.md index 237b844..8b92113 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,16 @@ The analyser is being developed with lot of tests. To run tests : $ make test ``` +```bash +$ make coverage-analysis +``` +runs a coverage analysis and creates an unanalyzed.lex file in tests/ containing the unanalyzed words. +## Dataset +```bash +$ make dataset +``` +creates a .csv file with the words from the tests/coverage/*.txt files. + ## Citation Please cite the following publication in order to refer to the mlmorph: diff --git a/tests/coverage-test.py b/tests/coverage-test.py index ad8114d..72399c7 100644 --- a/tests/coverage-test.py +++ b/tests/coverage-test.py @@ -29,8 +29,9 @@ def test_total_coverage(self): start = clock() print("%40s\t%8s\t%8s\t%s" % ('File name', 'Words', 'Analysed', 'Percentage')) - for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")): - with open(filename, 'r') as file: + with open("./tests/unanalyzed.lex", "w+") as unanFile: + for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")): + with open(filename, 'r') as file: tokens_count = 0 analysed_tokens_count = 0 for line in file: @@ -41,12 +42,14 @@ def test_total_coverage(self): analysis = self.analyser.analyse(word, False) if len(analysis) > 0: analysed_tokens_count += 1 + else: + unanFile.write(word+"\n") percentage = (analysed_tokens_count/tokens_count)*100 total_tokens_count += tokens_count total_analysed_tokens_count += analysed_tokens_count print("%40s\t%8d\t%8d\t%3.2f%%" % (os.path.basename( filename), tokens_count, analysed_tokens_count, 
percentage)) - file.close() + file.close(); percentage = (total_analysed_tokens_count/total_tokens_count)*100 time_taken = clock() - start print('%40s\t%8d\t%8d\t%3.2f%%' %