From 29d36c9dabc5d79846e2df6167c2e7616de2e1ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Sun, 2 Dec 2018 22:06:42 +0100 Subject: [PATCH 1/5] :fire: Git ignore lib directory As it is now generated --- .gitignore | 1 + lib/tokenizer.d.ts | 10 ------ lib/tokenizer.js | 78 ---------------------------------------------- 3 files changed, 1 insertion(+), 88 deletions(-) delete mode 100644 lib/tokenizer.d.ts delete mode 100644 lib/tokenizer.js diff --git a/.gitignore b/.gitignore index d1bc109..2b8fff3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.sublime-* node_modules +lib diff --git a/lib/tokenizer.d.ts b/lib/tokenizer.d.ts deleted file mode 100644 index 1cf22d7..0000000 --- a/lib/tokenizer.d.ts +++ /dev/null @@ -1,10 +0,0 @@ -export default class Tokenizer { - username: string; - entry: null | string; - sentences: null | string[]; - botname: string; - constructor(username?: string, botname?: string); - setEntry(entry: string): void; - getSentences(): string[]; - getTokens(sentenceIndex: number): string[]; -} diff --git a/lib/tokenizer.js b/lib/tokenizer.js deleted file mode 100644 index e6e5917..0000000 --- a/lib/tokenizer.js +++ /dev/null @@ -1,78 +0,0 @@ -"use strict"; - -// eslint-disable-next-line no-unused-vars -var debug = require('debug')('tokenizer'); - -function compact(str) { - var res = str.trim(); - res = res.replace(' ', ' '); - return res; -} - -function Tokenizer(username, botname) { - - // // Maybe it is not useful - // if (!(this instanceof Tokenizer)) { - // return new Tokenizer(); - // } - - this.username = username || 'Guy'; - this.entry = null; - this.sentences = null; - - if (typeof botname == 'string') { - this.botname = botname; - } - else { - this.botname = 'ECTOR'; - } -} - -Tokenizer.prototype = { - setEntry : function (entry) { - this.entry = compact(entry); - this.sentences = null; - }, - // Split the entry into sentences. - getSentences : function () { - // this.sentences = this.entry.split(/[\.!]\s/); - if (!this.entry) return []; - var words = this.entry.split(' '); - var endingWords = words.filter(function(w) { - return w.endsWith('.') || w.endsWith('!') || w.endsWith('?'); - }); - - var self = this; - var botnameRegExp = new RegExp("\\W?" + self.botname.normalize() + "\\W?"); - var usernameRegExp = new RegExp("\\W?" + self.username.normalize() + "\\W?"); - var lastSentence = words[0]; - self.sentences = []; - words.reduce(function (prev, cur) { - var curNormalized = cur.normalize(); - var curReplaced = cur; - if (curNormalized.search(botnameRegExp) !== -1) { - curReplaced = cur.replace(self.botname,"{yourname}"); - } - else if (curNormalized.search(usernameRegExp) !== -1) { - curReplaced = cur.replace(self.username,"{myname}"); - } - - if (endingWords.indexOf(prev) != -1) { - self.sentences.push(compact(lastSentence)); - lastSentence = ""; - } - lastSentence = lastSentence + " " + curReplaced; - return cur; - }); - self.sentences.push(compact(lastSentence)); - return this.sentences; - }, - // Get the tokens of one sentence - getTokens : function (sentenceIndex) { - var s = 0; - if(typeof sentenceIndex === 'number') s = sentenceIndex; - return this.sentences[s].split(' '); - } -}; - -module.exports = Tokenizer; \ No newline at end of file From 20aedf7f15ea21da3884ef53b60d5d232fcad08d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Sun, 2 Dec 2018 22:07:19 +0100 Subject: [PATCH 2/5] :heavy_plus_sign: Add typescript --- package-lock.json | 12 ++++++++++++ package.json | 11 +++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index 7cc4938..d9832a9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24,6 +24,12 @@ "js-tokens": "^4.0.0" } }, + "@types/node": { + "version": "10.12.11", + "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.11.tgz", + "integrity": "sha512-3iIOhNiPGTdcUNVCv9e5G7GotfvJJe2pc9w2UgDXlUwnxSZ3RgcUocIU+xYm+rTU54jIKih998QE4dMOyMN1NQ==", + "dev": true + }, "acorn": { "version": "6.0.4", "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.0.4.tgz", @@ -1084,6 +1090,12 @@ "prelude-ls": "~1.1.2" } }, + "typescript": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.2.1.tgz", + "integrity": "sha512-jw7P2z/h6aPT4AENXDGjcfHTu5CSqzsbZc6YlUIebTyBAq8XaKp78x7VcSh30xwSCcsu5irZkYZUSFP1MrAMbg==", + "dev": true + }, "uri-js": { "version": "4.2.2", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.2.2.tgz", diff --git a/package.json b/package.json index d1ec3d5..4e5e7ac 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,12 @@ "scripts": { "lint": "eslint lib", "test": "mocha", - "test-w": "mocha -w" + "test:w": "mocha -w", + "build": "tsc", + "build:w": "tsc --watch", + "pretest": "npm run build", + "prepublish": "npm run build", + "postversion": "git push && git push --tags" }, "homepage": "http://github.com/parmentf/node-sentence-tokenizer", "repository": { @@ -24,7 +29,9 @@ "debug": "4.1.0" }, "devDependencies": { + "@types/node": "10.12.11", "eslint": "5.9.0", - "mocha": "5.2.0" + "mocha": "5.2.0", + "typescript": "3.2.1" } } From 14633964618d9c80cfd6438e09c29b4cb6cf56c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Sun, 2 Dec 2018 22:10:01 +0100 Subject: [PATCH 3/5] :hammer: Rewrite in TypeScript --- README.md | 6 ++--- src/tokenizer.ts | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ test/test.js | 30 ++++++++++++------------ tsconfig.json | 15 ++++++++++++ 4 files changed, 94 insertions(+), 18 deletions(-) create mode 100644 src/tokenizer.ts create mode 100644 tsconfig.json diff --git a/README.md b/README.md index b2f5a4d..8ecade2 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ npm install sentence-tokenizer Require the module: ```js -var Tokenizer = require('sentence-tokenizer'); +var Tokenizer = require('sentence-tokenizer').Tokenizer; ``` Instanciate a tokenizer, with the name of the utterer: @@ -27,13 +27,13 @@ var tokenizer = new Tokenizer('Chuck'); Set the entry to work on: ```js -tokenizer.setEntry("This is an entry. Possibly composed of several sentences."); +tokenizer.entry = "This is an entry. Possibly composed of several sentences."; ``` Get the sentences: ```js -console.log(tokenizer.getSentences()); +console.log(tokenizer.sentences); ``` Which should produce: diff --git a/src/tokenizer.ts b/src/tokenizer.ts new file mode 100644 index 0000000..d020169 --- /dev/null +++ b/src/tokenizer.ts @@ -0,0 +1,61 @@ +const debug = require('debug')('tokenizer'); + +const compact = (str: string): string => str.trim().replace(' ', ' '); + +export class Tokenizer { + username: string + botname: string + protected _entry: string + protected _sentences: string[] + + constructor(username: string = 'Guy', botname: string = 'ECTOR') { + this.username = username + this.botname = botname + this._entry = ''; + this._sentences = []; + } + + set entry(value: string) { + this._entry = compact(value) + this._sentences = [] + } + + // Split the entry into sentences. + get sentences(): string[] { + // this.sentences = this.entry.split(/[\.!]\s/); + if (!this._entry) return []; + const words: string[] = this._entry.split(' '); + const endingWords = words.filter((w: string): boolean => + w.endsWith('.') || w.endsWith('!') || w.endsWith('?') + ); + + const botnameRegExp = new RegExp("\\W?" + this.botname.normalize() + "\\W?"); + const usernameRegExp = new RegExp("\\W?" + this.username.normalize() + "\\W?"); + this._sentences = []; + let lastSentence: string = words[0]; + words.reduce((prev, cur: string): string => { + const curNormalized: string = cur.normalize(); + let curReplaced: string = cur; + if (curNormalized.search(botnameRegExp) !== -1) { + curReplaced = cur.replace(this.botname,"{yourname}"); + } + else if (curNormalized.search(usernameRegExp) !== -1) { + curReplaced = cur.replace(this.username,"{myname}"); + } + + if (endingWords.indexOf(prev) !== -1) { + this._sentences.push(compact(lastSentence)); + lastSentence = ""; + } + lastSentence = lastSentence + " " + curReplaced; + return cur; + }); + this._sentences.push(compact(lastSentence)); + return this._sentences; + } + + // Get the tokens of one sentence + getTokens(sentenceIndex: number = 0): string[] { + return this._sentences[sentenceIndex].split(' '); + } +} diff --git a/test/test.js b/test/test.js index 9bd6b9d..559af64 100644 --- a/test/test.js +++ b/test/test.js @@ -5,7 +5,7 @@ var debug = require('debug')('tokenizer:test'); var assert = require('assert'); -var Tokenizer = require('../lib/tokenizer'); +var Tokenizer = require('../lib/tokenizer').Tokenizer; describe('Tokenizer creations', function () { describe('No botname', function () { @@ -35,8 +35,8 @@ describe('Sentences token', function () { " N'est-ce pas ? " + " Et avec une URL en plus, c'est mieux: http://google.com." + " Mais il nous manque encore un mail: gg@gggg.kk"; - tokenizer.setEntry(entry); - var sentences = tokenizer.getSentences(); + tokenizer.entry = entry; + var sentences = tokenizer.sentences; it("should get 4 sentences", function () { assert.equal(sentences.length, 4); @@ -71,8 +71,8 @@ describe('Sentences token', function () { describe('Two sentences', function () { var entry = "Salut." + " Hello."; - tokenizer.setEntry(entry); - var sentences = tokenizer.getSentences(); + tokenizer.entry = entry; + var sentences = tokenizer.sentences; it("should get 2 sentences", function () { assert.equal(sentences.length, 2); @@ -83,8 +83,8 @@ describe('Sentences token', function () { debug('Only one sentence!'); var entry = "Hello."; var tokenizer2 = new Tokenizer('François'); - tokenizer2.setEntry(entry); - var sentences = tokenizer2.getSentences(); + tokenizer2.entry = entry; + var sentences = tokenizer2.sentences; it('should get one sentence', function () { assert.equal(sentences.length, 1); @@ -97,8 +97,8 @@ describe('Sentences token', function () { describe('Empty sentence', function () { var entry = " "; - tokenizer.setEntry(entry); - var sentences = tokenizer.getSentences(); + tokenizer.entry = entry; + var sentences = tokenizer.sentences; it('should handle gracefully', function () { assert.equal(sentences.length, 0); @@ -107,8 +107,8 @@ describe('Sentences token', function () { describe('False end', function () { var entry = "Bon sang ce n'est pas ça. Bon sang"; - tokenizer.setEntry(entry); - var sentences = tokenizer.getSentences(); + tokenizer.entry = entry; + var sentences = tokenizer.sentences; it('should produce only 2 sentences', function () { assert.equal(sentences.length, 2); @@ -117,8 +117,8 @@ describe('Sentences token', function () { describe('Names', function () { var entry = "Salut ECTOR. Je m'appelle François."; - tokenizer.setEntry(entry); - var sentences = tokenizer.getSentences(); + tokenizer.entry = entry; + var sentences = tokenizer.sentences; it('botname replaced', function () { assert.equal(sentences[0], 'Salut {yourname}.'); @@ -136,8 +136,8 @@ describe('Word tokens', function() { " Je suis fort aise que tu m'écoutes." + " Très!!!" + " Appelle-moi François, si tu veux..."; - tokenizer.setEntry(entry); - tokenizer.getSentences(); + tokenizer.entry = entry; + var sentences = tokenizer.sentences; // eslint-disable-line no-unused-vars describe('First sentence', function () { var tokens = tokenizer.getTokens(0); diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..e066bbb --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compileOnSave": true, + "compilerOptions": { + "declaration": true, + "module": "commonjs", + "noFallthroughCasesInSwitch": true, + "noImplicitReturns": true, + "outDir": "./lib", + "sourceMap": true, + "target": "es2018", + }, + "include": [ + "./src/**/*" + ] +} From c9c1ca80318d18c30137b8208156ef1754eaac1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Sun, 2 Dec 2018 22:17:25 +0100 Subject: [PATCH 4/5] :green_heart: Test also in node 10 --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 27ef423..1d7de13 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ language: node_js node_js: - 6 - - 8 \ No newline at end of file + - 8 + - 10 \ No newline at end of file From e4c0c5ca2ae6b47fd334457b30e91c221a8aedbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Parmentier?= Date: Sun, 2 Dec 2018 23:01:57 +0100 Subject: [PATCH 5/5] :bug: NPM should not ignore lib! --- .npmignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .npmignore diff --git a/.npmignore b/.npmignore new file mode 100644 index 0000000..d1bc109 --- /dev/null +++ b/.npmignore @@ -0,0 +1,2 @@ +*.sublime-* +node_modules