1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
 *.sublime-*
 node_modules
+lib
2 changes: 2 additions & 0 deletions .npmignore
@@ -0,0 +1,2 @@
*.sublime-*
node_modules
3 changes: 2 additions & 1 deletion .travis.yml
@@ -1,4 +1,5 @@
 language: node_js
 node_js:
-  - 6
+  - 8
   - 8
+  - 10
6 changes: 3 additions & 3 deletions README.md
@@ -15,7 +15,7 @@ npm install sentence-tokenizer
 Require the module:

 ```js
-var Tokenizer = require('sentence-tokenizer');
+var Tokenizer = require('sentence-tokenizer').Tokenizer;
 ```

 Instantiate a tokenizer, with the name of the utterer:
@@ -27,13 +27,13 @@ var tokenizer = new Tokenizer('Chuck');
 Set the entry to work on:

 ```js
-tokenizer.setEntry("This is an entry. Possibly composed of several sentences.");
+tokenizer.entry = "This is an entry. Possibly composed of several sentences.";
 ```

 Get the sentences:

 ```js
-console.log(tokenizer.getSentences());
+console.log(tokenizer.sentences);
 ```

 Which should produce:
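The README's expected output is cut off in this diff. As a check on the new accessor-based API, here is a minimal sketch; the printed array is what the `sentences` getter in src/tokenizer.ts produces for this entry, not text quoted from the README:

```ts
import { Tokenizer } from 'sentence-tokenizer';

const tokenizer = new Tokenizer('Chuck');

// The setter replaces setEntry(); assigning also resets the cached sentences.
tokenizer.entry = 'This is an entry. Possibly composed of several sentences.';

// The getter replaces getSentences().
console.log(tokenizer.sentences);
// [ 'This is an entry.', 'Possibly composed of several sentences.' ]
```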
10 changes: 0 additions & 10 deletions lib/tokenizer.d.ts

This file was deleted.

78 changes: 0 additions & 78 deletions lib/tokenizer.js

This file was deleted.

12 changes: 12 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

11 changes: 9 additions & 2 deletions package.json
@@ -6,7 +6,12 @@
   "scripts": {
     "lint": "eslint lib",
     "test": "mocha",
-    "test-w": "mocha -w"
+    "test:w": "mocha -w",
+    "build": "tsc",
+    "build:w": "tsc --watch",
+    "pretest": "npm run build",
+    "prepublish": "npm run build",
+    "postversion": "git push && git push --tags"
   },
   "homepage": "http://github.com/parmentf/node-sentence-tokenizer",
   "repository": {
@@ -24,7 +29,9 @@
     "debug": "4.1.0"
   },
   "devDependencies": {
+    "@types/node": "10.12.11",
     "eslint": "5.9.0",
-    "mocha": "5.2.0"
+    "mocha": "5.2.0",
+    "typescript": "3.2.1"
   }
 }
61 changes: 61 additions & 0 deletions src/tokenizer.ts
@@ -0,0 +1,61 @@
const debug = require('debug')('tokenizer');

// Trim, and collapse a double space into a single one.
const compact = (str: string): string => str.trim().replace('  ', ' ');

export class Tokenizer {
    username: string
    botname: string
    protected _entry: string
    protected _sentences: string[]

    constructor(username: string = 'Guy', botname: string = 'ECTOR') {
        this.username = username
        this.botname = botname
        this._entry = '';
        this._sentences = [];
    }

    // Setting a new entry resets the cached sentences.
    set entry(value: string) {
        this._entry = compact(value)
        this._sentences = []
    }

    // Split the entry into sentences.
    get sentences(): string[] {
        // this.sentences = this.entry.split(/[\.!]\s/);
        if (!this._entry) return [];
        const words: string[] = this._entry.split(' ');
        // Words that close a sentence (end with ., ! or ?).
        const endingWords = words.filter((w: string): boolean =>
            w.endsWith('.') || w.endsWith('!') || w.endsWith('?')
        );

        const botnameRegExp = new RegExp("\\W?" + this.botname.normalize() + "\\W?");
        const usernameRegExp = new RegExp("\\W?" + this.username.normalize() + "\\W?");
        this._sentences = [];
        let lastSentence: string = words[0];
        words.reduce((prev, cur: string): string => {
            const curNormalized: string = cur.normalize();
            let curReplaced: string = cur;
            // Swap the bot's or the user's name for a placeholder.
            if (curNormalized.search(botnameRegExp) !== -1) {
                curReplaced = cur.replace(this.botname, "{yourname}");
            }
            else if (curNormalized.search(usernameRegExp) !== -1) {
                curReplaced = cur.replace(this.username, "{myname}");
            }

            // The previous word closed a sentence: flush it and start a new one.
            if (endingWords.indexOf(prev) !== -1) {
                this._sentences.push(compact(lastSentence));
                lastSentence = "";
            }
            lastSentence = lastSentence + " " + curReplaced;
            return cur;
        });
        this._sentences.push(compact(lastSentence));
        return this._sentences;
    }

    // Get the tokens of one sentence.
    getTokens(sentenceIndex: number = 0): string[] {
        return this._sentences[sentenceIndex].split(' ');
    }
}
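The name-replacement branches are easiest to follow with a concrete input. This sketch mirrors the 'Names' case in test/test.js; the first expected string comes straight from that test's assertion, and the second follows from the username branch of the getter:

```ts
import { Tokenizer } from 'sentence-tokenizer';

// 'François' is the utterer's name; the bot name defaults to 'ECTOR'.
const tokenizer = new Tokenizer('François');
tokenizer.entry = "Salut ECTOR. Je m'appelle François.";

// Both names are swapped for placeholders while the entry is split.
console.log(tokenizer.sentences);
// [ 'Salut {yourname}.', "Je m'appelle {myname}." ]

// getTokens() splits one already-computed sentence on spaces.
console.log(tokenizer.getTokens(0));
// [ 'Salut', '{yourname}.' ]
```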
30 changes: 15 additions & 15 deletions test/test.js
@@ -5,7 +5,7 @@
 var debug = require('debug')('tokenizer:test');
 var assert = require('assert');

-var Tokenizer = require('../lib/tokenizer');
+var Tokenizer = require('../lib/tokenizer').Tokenizer;

 describe('Tokenizer creations', function () {
     describe('No botname', function () {
@@ -35,8 +35,8 @@ describe('Sentences token', function () {
         " N'est-ce pas ? " +
         " Et avec une URL en plus, c'est mieux: http://google.com." +
         " Mais il nous manque encore un mail: gg@gggg.kk";
-    tokenizer.setEntry(entry);
-    var sentences = tokenizer.getSentences();
+    tokenizer.entry = entry;
+    var sentences = tokenizer.sentences;

     it("should get 4 sentences", function () {
         assert.equal(sentences.length, 4);
@@ -71,8 +71,8 @@ describe('Sentences token', function () {
     describe('Two sentences', function () {
         var entry = "Salut." +
             " Hello.";
-        tokenizer.setEntry(entry);
-        var sentences = tokenizer.getSentences();
+        tokenizer.entry = entry;
+        var sentences = tokenizer.sentences;

         it("should get 2 sentences", function () {
             assert.equal(sentences.length, 2);
@@ -83,8 +83,8 @@ describe('Sentences token', function () {
         debug('Only one sentence!');
         var entry = "Hello.";
         var tokenizer2 = new Tokenizer('François');
-        tokenizer2.setEntry(entry);
-        var sentences = tokenizer2.getSentences();
+        tokenizer2.entry = entry;
+        var sentences = tokenizer2.sentences;

         it('should get one sentence', function () {
             assert.equal(sentences.length, 1);
@@ -97,8 +97,8 @@ describe('Sentences token', function () {

     describe('Empty sentence', function () {
         var entry = " ";
-        tokenizer.setEntry(entry);
-        var sentences = tokenizer.getSentences();
+        tokenizer.entry = entry;
+        var sentences = tokenizer.sentences;

         it('should handle gracefully', function () {
             assert.equal(sentences.length, 0);
@@ -107,8 +107,8 @@ describe('Sentences token', function () {

     describe('False end', function () {
         var entry = "Bon sang ce n'est pas ça. Bon sang";
-        tokenizer.setEntry(entry);
-        var sentences = tokenizer.getSentences();
+        tokenizer.entry = entry;
+        var sentences = tokenizer.sentences;

         it('should produce only 2 sentences', function () {
             assert.equal(sentences.length, 2);
@@ -117,8 +117,8 @@ describe('Sentences token', function () {

     describe('Names', function () {
         var entry = "Salut ECTOR. Je m'appelle François.";
-        tokenizer.setEntry(entry);
-        var sentences = tokenizer.getSentences();
+        tokenizer.entry = entry;
+        var sentences = tokenizer.sentences;

         it('botname replaced', function () {
             assert.equal(sentences[0], 'Salut {yourname}.');
@@ -136,8 +136,8 @@ describe('Word tokens', function() {
         " Je suis fort aise que tu m'écoutes." +
         " Très!!!" +
         " Appelle-moi François, si tu veux...";
-    tokenizer.setEntry(entry);
-    tokenizer.getSentences();
+    tokenizer.entry = entry;
+    var sentences = tokenizer.sentences; // eslint-disable-line no-unused-vars

     describe('First sentence', function () {
         var tokens = tokenizer.getTokens(0);
15 changes: 15 additions & 0 deletions tsconfig.json
@@ -0,0 +1,15 @@
{
    "compileOnSave": true,
    "compilerOptions": {
        "declaration": true,
        "module": "commonjs",
        "noFallthroughCasesInSwitch": true,
        "noImplicitReturns": true,
        "outDir": "./lib",
        "sourceMap": true,
        "target": "es2018",
    },
    "include": [
        "./src/**/*"
    ]
}
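With `declaration: true` and `outDir: "./lib"`, `npm run build` (wired into `pretest` and `prepublish` above) regenerates the deleted lib/tokenizer.js and lib/tokenizer.d.ts on demand, which is why `lib` is now git-ignored but absent from .npmignore. A hypothetical consumer, assuming TypeScript resolves the emitted declaration file next to the package's main entry:

```ts
import { Tokenizer } from 'sentence-tokenizer';

const tokenizer = new Tokenizer('Ada'); // 'Ada' is a made-up utterer name
tokenizer.entry = 'One sentence. Two sentences.';

// Typed as string[] via the generated lib/tokenizer.d.ts.
const sentences: string[] = tokenizer.sentences;
console.log(sentences);
```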