diff --git a/.gitignore b/.gitignore
index 894a44c..24e83e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,11 @@ venv.bak/
# mypy
.mypy_cache/
+
+# IDE folders
+.idea/
+.vscode/
+
+# Dependencies
+DrQA/
+stanford-corenlp-full-*
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 0e40fe8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-
-# Default ignored files
-/workspace.xml
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/macaw.iml b/.idea/macaw.iml
deleted file mode 100644
index 055624e..0000000
--- a/.idea/macaw.iml
+++ /dev/null
@@ -1,17 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index d1151ac..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 7021aec..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b227825
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,78 @@
+FROM ubuntu:20.04
+
+WORKDIR /usr/src/app
+
+# Disable interactive input. This is needed to install mongodb.
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update && apt-get install -y \
+ gnupg \
+ wget \
+ g++ \
+ make \
+ zlib1g-dev \
+ software-properties-common \
+ unzip \
+ default-jre \
+ git \
+ apt-transport-https ca-certificates \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && apt-get install -y python3-pip
+
+RUN apt update && apt install -y ffmpeg
+
+RUN pip3 install --upgrade pip && pip3 install torch
+
+# Install all dependencies mentioned in the macaw requirements document.
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+# Download Stanford core NLP data if user has not specified a local volume. This is a 400MB compressed file.
+ARG download_stanford_corenlp=false
+RUN if $download_stanford_corenlp ; then \
+ wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip" \
+ && unzip "stanford-corenlp-full-2017-06-09.zip" \
+ && rm "stanford-corenlp-full-2017-06-09.zip" ; fi
+
+# Download and install FAIR DrQA module.
+RUN git clone https://github.com/facebookresearch/DrQA.git
+RUN cd DrQA \
+ && pip3 install -r requirements.txt \
+ && python3 setup.py develop
+
+# Download a pre-trained DrQA model if user has not specified a local volume. This is a 7.5GB compressed file download
+# and requires 25GB of uncompressed space. To save Docker image memory, it is recommended to use external file as volume.
+ARG download_drqa_model=false
+RUN if $download_drqa_model ; then cd DrQA && ./download.sh ; fi
+
+# Install MongoDB server https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/
+RUN wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add -
+RUN echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/5.0 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-5.0.list
+RUN apt-get update && apt-get install -y mongodb-org
+
+# Create the MongoDB data directory.
+RUN mkdir -p /data/db
+
+# Copy directory that contains trectext documents needed for Indri retriever.
+COPY trec_documents trec_documents
+
+# Copy files and directories from workspace to Docker container.
+COPY macaw macaw
+COPY scripts scripts
+COPY setup.py setup.py
+
+ENV PYTHONPATH="$PYTHONPATH:/usr/src/app"
+
+# Install Macaw.
+RUN python3 setup.py install
+
+# To fix async keyword issue in python3.7+ https://github.com/pexpect/pexpect/issues/453
+RUN pip3 install -Iv pexpect==4.8.0
+
+# Create index
+RUN mkdir tantivy_index/
+RUN python3 macaw/build_tantivy_index.py --index_path tantivy_index/ --document_path trec_documents/
+
+# Run the script that will start MongoDB and run python application.
+CMD ["/bin/bash", "scripts/start.sh"]
diff --git a/README.md b/README.md
index c6e6d06..add050d 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,33 @@
# Macaw: An Extensible Conversational Information Seeking Platform
+
Conversational information seeking (CIS) has been recognized as a major emerging research area in information retrieval.
-Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is
-an open-source framework with a modular architecture for CIS research. Macaw supports *multi-turn*, *multi-modal*, and
-*mixed-initiative* interactions, for tasks such as document retrieval, question answering, recommendation, and
-structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be
-evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in
-an interactive mode, where the back end can be *fully algorithmic* or a *wizard of oz* setup.
-
-Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language
+Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is an
+open-source framework with a modular architecture for CIS research. Macaw supports _multi-turn_, _multi-modal_, and
+_mixed-initiative_ interactions, for tasks such as document retrieval, question answering, recommendation, and
+structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be
+evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in
+an interactive mode, where the back end can be _fully algorithmic_ or a _wizard of oz_ setup.
+
+Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language
processing, and dialogue systems.
For more information on Macaw, please refer to [this paper](https://arxiv.org/pdf/1912.08904.pdf).
Table of content:
-+ [Macaw Architecture](#macaw-architecture)
- + [Interfaces](#interfaces)
- + [Retrieval](#retrieval)
- + [Answer Selection and Generation](#answer-selection-and-generation)
-+ [Installation](#installation)
-+ [Running Macaw](#running-macaw)
-+ [Bug Report and Feature Request](#bug-report-and-feature-request)
-+ [Citation](#citation)
-+ [License](#license)
-+ [Contribution](#contribution)
+
+- [Macaw Architecture](#macaw-architecture)
+ - [Interfaces](#interfaces)
+ - [Retrieval](#retrieval)
+ - [Answer Selection and Generation](#answer-selection-and-generation)
+- [Installation](#installation)
+- [Running Macaw](#running-macaw)
+- [Bug Report and Feature Request](#bug-report-and-feature-request)
+- [Citation](#citation)
+- [License](#license)
+- [Contribution](#contribution)
## Macaw Architecture
+
Macaw has a modular architecture, which allows further development and extension. The high-level architecture of Macaw
is presented below:
@@ -33,88 +36,212 @@ is presented below:
For more information on each module in Macaw, refer to this paper.
#### Interfaces
+
Macaw supports the following interfaces:
-+ Standard IO: For *development* purposes
-+ File IO: For *batch experiments* (see the examples in the `data` folder for input and output file formats)
-+ Telegram bot: For interaction with real users
+
+- Standard IO: For _development_ purposes
+- File IO: For _batch experiments_ (see the examples in the `data` folder for input and output file formats)
+- Telegram bot: For interaction with real users
Here is an example of the Telegram interface for Macaw. It supports multi-modal interactions (text, speech, click, etc).
-
+

-
#### Retrieval
+
Macaw features the following search engines:
-+ [Indri](http://lemurproject.org/indri.php): an open-source search engine that can be used for any arbitrary text
-collection.
-+ Bing web search API: sending a request to the Bing API and getting the results.
+
+- [Tantivy](https://github.com/quickwit-oss/tantivy): Tantivy is a full-text search engine library written in Rust.
+- Bing web search API: sending a request to the Bing API and getting the results.
#### Answer Selection and Generation
-For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current
+
+For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current
version.
+## Installation and running with Docker
-## Installation
-Macaw requires `Python >= 3.6` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`.
-To install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The
-mentioned installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux
-distribution. If you are using Windows 10, we recommend installing Macaw and all the required packages on
-[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10).
+The package has been tested with certain dependencies and it is much easier to reproduce it in a similar environment. It
+has been integrated with Docker to make it compatible with all operating systems. The default Docker setup runs the
+application using the Standard IO interface, uses Tantivy for document retrieval, and DrQA for MRC (answer selection). To
+run using other settings, appropriate changes should be done.
-#### Step 1: Installing MongoDB server
-Macaw uses MongoDB for storing and retrieving user interactions (conversations). To install MongoDB server, run the
-following command:
+The first step is to install [Docker](https://docs.docker.com/engine/install/) in your system. Then continue with the
+below steps.
+
+### Create the build
+
+To reduce the size of the build, we can keep certain data outside the Docker container and mount it
+using [volumes](https://docs.docker.com/storage/volumes/).
+
+1. Download the Stanford Core NLP data
+ from [here](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and put the
+ directory `stanford-corenlp-full-2017-06-09` in your project root directory.
+1. Install [DrQA](https://github.com/facebookresearch/DrQA) in a separate workspace and download the pre-trained model.
+ It stores the models in `data/reader/` directory. We will use the downloaded _multitask.mdl_ model.
+
+Once you have the two downloads done, run the below command from project root to create a docker build with name _macaw_:
+
+```commandline
+docker build -t macaw .
```
-sudo apt-get install mongodb-server-core
+
+If you don't want to pre-install DrQA model and Stanford Core NLP data, create the build using the below command. It
+will install both dependencies for you and keep them as part of the build. Note that this will significantly increase
+the build size (by ~400MB for Stanford CoreNLP and by ~7.5GB for DrQA).
+
+```commandline
+docker build --build-arg download_stanford_corenlp=true --build-arg download_drqa_model=true -t macaw .
+```
+
+_Note: To make sure that the Docker container builds without modification, an x86_64/amd64 system is required. If you have an arm64 device, then add the flag `--platform linux/amd64` to the build command._
+
+### Run the application
+
+If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need
+to provide the local directory path during runtime. Run the command from project root.
+
+```commandline
+docker run --rm -i --name=macaw_test_container \
+-v <local-drqa-data-path>:/usr/src/app/DrQA/data \
+-v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \
+macaw
+```
+
+`<local-drqa-data-path>` could be `/Users/amitgh/PycharmProjects/DrQA/data` if you downloaded the pre-trained model in a
+separate workspace named DrQA.
+
+If you did not separately download data at build time, simply run:
+
+```commandline
+docker run --rm -i --name=macaw_test_container macaw
+```
+
+In above command we start a container with name _macaw_test_container_ from build image _macaw_ in interactive
+mode (`-i`)
+and remove the container when the application exits (`--rm`). After installing all dependencies, it
+runs `scripts/start.sh`
+which first starts MongoDB server in a separate thread and then runs `live_main.py`.
+
+_Note: Similar to requiring the additional flag of `--platform linux/amd64` to build the Docker container with an arm64 machine, running said container also requires the same flag.
+:warning: **The performance of the container under this emulation will be incredibly poor. If possible, use a x86_64/amd64 system**._
+
+#### Run with file input
+
+To avoid typing the input every time, you can provide an input file and get output in an output file. We need to mount
+the directory containing the data.
+
+```commandline
+docker build -t macaw . && docker run --rm -i --name=macaw_test_container \
+-v <local-drqa-data-path>:/usr/src/app/DrQA/data \
+-v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \
+-v $("pwd")/data:/usr/src/app/data \
+macaw
+```
+
+Also update the command inside `scripts/start.sh` file to
+```commandline
+python3 macaw/live_main.py --mode exp --interface fileio
+```
+
+### ssh into the container
+
+While the application is running, we can go inside the container to see the contents (directory structure, Tantivy index,
+etc.).
+
+```commandline
+docker exec -it macaw_test_container /bin/bash
+```
+
+### Updating TREC data for Tantivy
+
+Tantivy index is created using the document stored in `trec_documents/` directory. It has some default data. To create a
+bigger index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents.
+Docker will copy it during build time and create a new index.
+
+## Running entire Macaw application
+
+Using `docker compose` we can start the main application and all other supporting docker containers (nlp pipeline
+applications and remote modules) at once. This does not work with stdio mode as docker compose does not support
+terminal input. Run the below command.
+
+```commandline
+docker compose build && docker compose up
```
-#### Step 2: Installing Indri and Pyndri
-[Indri](http://lemurproject.org/indri.php) is an open-source search engine for information retrieval research,
-implemented as part of the [Lemur Project](http://lemurproject.org/).
-[Pyndri](https://github.com/cvangysel/pyndri) is a python interface to Indri. Macaw uses Indri for retrieving documents
-from an arbitrary text collection.
-To install Indri, first download Indri from https://sourceforge.net/projects/lemur/files/lemur/. As suggested by pyndri,
-we have used Indri-5.11. This Indri version can be installed as follows:
+To run different containers independently or to support terminal input, run the below commands in order.
+
+First, build the application.
+
+```commandline
+docker compose build
```
-# download indri-5.11.tar.gz
-sudo apt install g++ zlib1g-dev
-tar xzvf indri-5.11.tar.gz
-rm indri-5.11.tar.gz
-cd indri-5.11
-./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0"
-make
-sudo make install
+
+Second, start all the supporting remote modules. Make sure to explicitly provide port and container names. This can be
+found from the `docker-compose.yml` file.
+
+```commandline
+docker compose run --rm -p "127.0.0.1:8001:80" --name nlp-pipeline-app-flask nlp-pipeline-app-flask
+docker compose run --rm -p "127.0.0.1:8002:80" --name nlp-pipeline-app-ic nlp-pipeline-app-ic
+docker compose run --rm -p "127.0.0.1:8003:80" --name response-generator-app-qa response-generator-app-qa
+```
+
+Third, run the main application which has stdio (or it can also have fileio). For stdio update the flags in `start.sh`
+with `--mode live --interface stdio`.
+
+```commandline
+docker compose run --rm base-app
```
-Then, clone the pyndri repository from https://github.com/cvangysel/pyndri and run the following command:
+## Local Setup
+
+To setup the package locally without using Docker, follow the below instructions.
+
+### Installation
+
+Macaw requires `Python >= 3.5` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. To
+install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The mentioned
+installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux distribution.
+If you are using Windows 10, we recommend installing Macaw and all the required packages on
+[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10).
+
+#### Step 1: Installing MongoDB server
+
+Macaw uses MongoDB for storing and retrieving user interactions (conversations). To install MongoDB server, run the
+following command:
+
```
-python3 setup.py install
+sudo apt-get install mongodb-server-core
```
At this step, you can make sure your installation is complete by running the pyndri tests.
-#### Step 3: Installing Stanford Core NLP
-Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need
-co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. Get it by following these
+#### Step 2: Installing Stanford Core NLP
+
+Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need
+co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. Get it by following these
commands:
+
```
wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip"
sudo apt-get install unzip
unzip "stanford-corenlp-full-2017-06-09.zip"
rm "stanford-corenlp-full-2017-06-09.zip"
-```
+```
If you don't have `java`, install it using:
+
```
sudo apt-get install default-jre
```
-#### Step 4: Installing DrQA
-Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it
+#### Step 3: Installing DrQA
+
+Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it
features [DrQA](https://github.com/facebookresearch/DrQA). If you do not need this functionality, ignore this step (you
-can also install this later).
-To install DrQA, run the following commands:
+can also install this later). To install DrQA, run the following commands:
+
```
git clone https://github.com/facebookresearch/DrQA.git
cd DrQA
@@ -123,47 +250,54 @@ pip3 install torch
sudo python3 setup.py develop
```
-To use pre-trained DrQA model, use the following command.
+To use pre-trained DrQA model, use the following command.
+
```
./download.sh
```
+
This downloads a 7.5GB (compressed) file and requires 25GB (uncompressed) space. This may take a while!
-
+#### Step 4: Installing FFmpeg
+
+To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't need
+a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command:
-#### Step 5: Installing FFmpeg
-To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't
-need a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command:
```
-sudo apt-get install
+sudo apt-get install ffmpeg
```
-#### Step 6: Installing Macaw
+#### Step 5: Installing Macaw
+
After cloning Macaw, use the following commands for installation:
+
```
cd macaw
sudo pip3 install -r requirements.txt
sudo python3 setup.py install
```
-## Running Macaw
+### Running Macaw
+
If you run macaw with interactive (or live) mode, you should first run MongoDB server using the following command:
+
```
sudo mongod
```
-Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create
-this directory if you haven't. You can also use other locations using the `--dbpath` argument.
+Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create
+this directory if you haven't. You can also use other locations using the `--dbpath` argument.
We provide three different main scripts (i.e., app):
-+ `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram
-interfaces.
-+ `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the
-interface.
-+ `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments.
-
+
+- `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram
+ interfaces.
+- `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the
+ interface.
+- `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments.
+
After selecting the desired main script, open the python file and provide the required parameters. For example, you need
-to use your Bing subscription key (if using Bing), the path to Indri index (if using Indri), Telegram bot token (if
+to use your Bing subscription key (if using Bing), the path to Tantivy index, Telegram bot token (if
using Telegram interface), etc. in order to run the `live_main.py` script. You can further run the favorite main script
as below:
@@ -171,18 +305,21 @@ as below:
python3 live_main.py
```
-
## Bug Report and Feature Request
-For bug report and feature request, you can open an issue in github, or send an email to
+
+For bug report and feature request, you can open an issue in github, or send an email to
[Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`.
## Citation
+
If you found Macaw useful, you can cite the following article:
+
```
Hamed Zamani and Nick Craswell, "Macaw: An Extensible Conversational Information Seeking System", arxiv pre-print.
```
bibtex:
+
```
@article{macaw,
title={Macaw: An Extensible Conversational Information Seeking Platform},
@@ -193,18 +330,18 @@ bibtex:
```
## License
-Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information.
+Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information.
## Contribution
-This project welcomes contributions and suggestions. Most contributions require you to agree to a
-Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
-the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License
+Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For
+details, visit https://cla.opensource.microsoft.com.
-When you submit a pull request, a CLA bot will automatically determine whether you need to provide
-a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
-provided by the bot. You will only need to do this once across all repos using our CLA.
+When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate
+the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only
+need to do this once across all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
diff --git a/data/file_input.txt b/data/file_input.txt
new file mode 100644
index 0000000..5a3ab77
--- /dev/null
+++ b/data/file_input.txt
@@ -0,0 +1,4 @@
+q1 who is the president of Japan?
+q2 when did the gulf war start?
+q3 what rights did the kingdom restore with israel?
+q4 when did jordan regained sovereignty over its territories?
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..db68cec
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,34 @@
+version: '3.4'
+services:
+ base-app:
+ build: ./
+ ports:
+ - "127.0.0.1:8000:80"
+ volumes:
+ - "/Users/amitgh/PycharmProjects/DrQA/data:/usr/src/app/DrQA/data"
+ - "/Users/amitgh/PycharmProjects/Maruna/macaw/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09"
+ - "/Users/amitgh/PycharmProjects/Maruna/macaw/data:/usr/src/app/data"
+ deploy:
+ resources:
+ limits:
+ memory: 5gb
+ nlp-pipeline-app-flask:
+ build: ./macaw/docker/flask_app
+ ports:
+ - "127.0.0.1:8001:80"
+ deploy:
+ resources:
+ limits:
+ memory: 100mb
+ nlp-pipeline-app-ic:
+ build: ./macaw/docker/ic_app
+ ports:
+ - "127.0.0.1:8002:80"
+ response-generator-app-qa:
+ build: ./macaw/docker/qa_app
+ ports:
+ - "127.0.0.1:8003:80"
+ deploy:
+ resources:
+ limits:
+ memory: 100mb
diff --git a/macaw/batch_exp_main.py b/macaw/batch_exp_main.py
index 9e61b96..8cad6b3 100644
--- a/macaw/batch_exp_main.py
+++ b/macaw/batch_exp_main.py
@@ -6,7 +6,7 @@
from macaw.cis import CIS
from macaw.core import mrc, retrieval
-from macaw.core.input_handler.action_detection import RequestDispatcher
+from macaw.core.response.action_detection import RequestDispatcher
from macaw.core.output_handler import naive_output_selection
diff --git a/macaw/build_tantivy_index.py b/macaw/build_tantivy_index.py
new file mode 100644
index 0000000..e584c7f
--- /dev/null
+++ b/macaw/build_tantivy_index.py
@@ -0,0 +1,55 @@
+import argparse
+import os
+from typing import List
+
+import tantivy
+
+from macaw.core.retrieval.doc import get_trec_doc
+
+
+def get_trec_docs(documents_path: str) -> List[str]:
+ # can be optimized
+ doc_list = []
+ raw_body_list = []
+ for trec_files in os.listdir(documents_path):
+ if not trec_files.startswith('.'):
+ with open(os.path.join(documents_path, trec_files), 'r', encoding="utf-8") as fobj:
+ trec_txt = fobj.read()
+ raw_body_list.append(trec_txt)
+ doc_list.append(get_trec_doc(trec_txt))
+ return doc_list, raw_body_list
+
+
+def main(index_path, documents_path):
+ # build the schema
+ schema_builder = tantivy.SchemaBuilder()
+ schema_builder.add_text_field("body", stored=True)
+ schema_builder.add_unsigned_field("doc_id", stored=True)
+ schema = schema_builder.build()
+ # create index
+ index = tantivy.Index(schema, path=index_path)
+ # read all trec doc
+ documents, raw_docs = get_trec_docs(documents_path)
+ # add documents
+ print('Building sparse index of {} docs...'.format(len(documents)))
+ writer = index.writer()
+ for i, doc in enumerate(raw_docs):
+ writer.add_document(tantivy.Document(
+ body=[doc], # this is the raw text of the trec document
+ doc_id=i
+ ))
+ if (i + 1) % 100000 == 0:
+ writer.commit()
+ print('Indexed {} docs'.format(i + 1))
+ writer.commit()
+ print('Built sparse index')
+ index.reload()
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Index documents')
+ parser.add_argument('--index_path', type=str, help='path to store the index')
+ parser.add_argument('--document_path', type=str, help='path for documents to index')
+ args = parser.parse_args()
+
+ main(args.index_path, args.document_path)
diff --git a/macaw/cis.py b/macaw/cis.py
index a36ab54..c2a0f5d 100644
--- a/macaw/cis.py
+++ b/macaw/cis.py
@@ -1,15 +1,19 @@
-"""
-The CIS class.
-
-Authors: Hamed Zamani (hazamani@microsoft.com)
-"""
-
from abc import ABC, abstractmethod
+from datetime import datetime
+from typing import List
+import logging
+
from func_timeout import FunctionTimedOut
+from core.response.handler import ResponseGeneratorHandler
from macaw import interface, util
-from macaw.core.interaction_handler.user_requests_db import InteractionDB
+from macaw.core.dialogue_manager.dialogue_manager import DialogManager
+from macaw.core.response.action_detection import RequestDispatcher
+from macaw.core.interaction_handler import CurrentAttributes
from macaw.core.interaction_handler.msg import Message
+from macaw.core.interaction_handler.user_requests_db import InteractionDB
+from macaw.core.nlp_pipeline.nlp_pipeline import NlpPipeline
+from macaw.core.output_handler import naive_output_selection
class CIS(ABC):
@@ -22,75 +26,83 @@ def __init__(self, params):
params(dict): A dict containing some parameters.
"""
self.params = params
- if params['mode'] == 'live':
- self.params['live_request_handler'] = self.live_request_handler
- self.msg_db = InteractionDB(host=self.params['interaction_db_host'],
- port=self.params['interaction_db_port'],
- dbname=self.params['interaction_db_name'])
- elif params['mode'] == 'exp':
- self.params['experimental_request_handler'] = self.request_handler_func
+ if params["mode"] == "live":
+ self.params["live_request_handler"] = self.live_request_handler
+ elif params["mode"] == "exp":
+ self.params["experimental_request_handler"] = self.file_request_handler
+
+ self.msg_db = InteractionDB(
+ host=self.params["interaction_db_host"],
+ port=self.params["interaction_db_port"],
+ dbname=self.params["interaction_db_name"],
+ )
+ self.logger = logging.getLogger("MacawLogger")
+ self.params["curr_attrs"] = self.curr_attrs = CurrentAttributes()
+ self.params["actions"] = self.generate_actions()
self.interface = interface.get_interface(params)
+ self.request_dispatcher = RequestDispatcher(self.params)
+ self.output_selection = naive_output_selection.NaiveOutputProcessing({})
+ self.dialogue_manager = DialogManager()
+ self.nlp_pipeline = NlpPipeline(params.get("nlp_models", {}))
+ self.response_generator_handler = ResponseGeneratorHandler(params.get("response_generator_models", {}))
+
try:
self.nlp_util = util.NLPUtil(self.params)
- self.params['nlp_util'] = self.nlp_util
+ self.params["nlp_util"] = self.nlp_util
except Exception as ex:
- self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module.')
- self.timeout = self.params['timeout'] if 'timeout' in self.params else -1
+ self.logger.warning(
+ f"There is a problem with setting up the NLP utility module. {type(ex)}, {ex}"
+ )
+ self.timeout = self.params["timeout"] if "timeout" in self.params else -1
- def live_request_handler(self, msg):
+ def live_request_handler(self, msg: Message):
try:
- # load conversation from the database and add the current message to the database
- conv = [msg] + self.msg_db.get_conv_history(user_id=msg.user_id, max_time=10 * 60 * 1000, max_count=10)
- self.msg_db.insert_one(msg)
+ # load conversation from the database.
+ history = self.msg_db.get_conv_history(
+ user_id=msg.user_id, max_time=10*60*1000, max_count=10
+ )
+
+ conv = [msg] + history
# output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv])
output_msg = self.request_handler_func(conv)
+
+ # Save the output and conversation state in DB.
self.msg_db.insert_one(output_msg)
return output_msg
except FunctionTimedOut:
+ print(f"live_request_handler method timed out.")
msg_info = dict()
- msg_info['msg_id'] = msg.msg_info['msg_id']
- msg_info['msg_source'] = 'system'
- msg_info['msg_type'] = 'error'
- text = 'Time out, no result!'
- timestamp = util.current_time_in_milliseconds()
- error_msg = Message(msg.user_interface, msg.user_id, msg.user_info, msg_info, text, timestamp)
+ msg_info["msg_id"] = msg.msg_info["msg_id"]
+ msg_info["msg_source"] = "system"
+ msg_info["msg_type"] = "error"
+ text = "Time out, no result!"
+ timestamp = datetime.utcnow()
+ error_msg = Message(
+ user_interface=msg.user_interface,
+ user_id=msg.user_id,
+ user_info=msg.user_info,
+ msg_info=msg_info,
+ text=text,
+ timestamp=timestamp,
+ )
self.msg_db.insert_one(error_msg)
return error_msg
- # def experimental_request_handler(self, str_list):
- # if not isinstance(str_list, list):
- # raise Exception('The input should be a list!')
- #
- # conv_list = []
- # for i in range(len(str_list)):
- # if not isinstance(str_list[i], str):
- # raise Exception('Each element of the input should be a string!')
- # user_info = {'first_name': 'NONE'}
- # msg_info = {'msg_id': -1,
- # 'msg_type': 'command' if str_list[i].startswith('#') else 'text',
- # 'msg_source': 'user'}
- # msg = Message(user_interface='NONE',
- # user_id=-1,
- # user_info=user_info,
- # msg_info=msg_info,
- # text=str_list[i],
- # timestamp=util.current_time_in_milliseconds())
- # conv_list.append(msg)
- # conv_list.reverse()
- #
- # if self.timeout > 0:
- # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv_list])
- # else:
- # output_msg = self.request_handler_func(conv_list)
- # return output_msg
+ def file_request_handler(self, msg: Message):
+ # Just delegate the call to the live request handler.
+ return self.live_request_handler(msg)
@abstractmethod
- def request_handler_func(self, conv_list):
+ def request_handler_func(self, conv_list: List[Message]) -> Message:
pass
@abstractmethod
def run(self):
- pass
\ No newline at end of file
+ pass
+
+ @abstractmethod
+ def generate_actions(self) -> dict:
+ pass
diff --git a/macaw/core/input_handler/__init__.py b/macaw/core/dialogue_manager/__init__.py
similarity index 100%
rename from macaw/core/input_handler/__init__.py
rename to macaw/core/dialogue_manager/__init__.py
diff --git a/macaw/core/dialogue_manager/dialogue_manager.py b/macaw/core/dialogue_manager/dialogue_manager.py
new file mode 100644
index 0000000..d64f48f
--- /dev/null
+++ b/macaw/core/dialogue_manager/dialogue_manager.py
@@ -0,0 +1,29 @@
+from core.dialogue_manager.dst import DST, State
+
+
+class DialogManager(object):
+ def __init__(self, dst=None):
+ self.dst = DST() if dst is None else dst
+
+ @classmethod
+ def decode(cls, encoded: dict):
+ return DialogManager(DST.decode(encoded["dst"]))
+
+ def process_turn(self, nlp_pipeline_output: dict):
+ # Get these from the nlp_pipeline output.
+ input_alphabet = None
+ context = None
+
+ new_state = self.dst.transition(input_alphabet, context)
+ if isinstance(new_state, State):
+ self.dst.update(new_state)
+ print(f"new_state={new_state}")
+ else:
+ print(f"Couldn't change state: {new_state}")
+ pass
+ # Save output result in conversation context.
+
+ def encode(self) -> dict:
+ return {
+ "dst": self.dst.encode()
+ }
diff --git a/macaw/core/dialogue_manager/dst.py b/macaw/core/dialogue_manager/dst.py
new file mode 100644
index 0000000..142c60f
--- /dev/null
+++ b/macaw/core/dialogue_manager/dst.py
@@ -0,0 +1,42 @@
+import enum
+
+from typing import Union
+
+
+class State(enum.Enum):
+ launch = 1
+ recipe_selection = 2
+ ingredients = 3
+ show_steps = 4
+ goodbye = 5
+
+
+class DST:
+ def __init__(self, state=None):
+ self.curr_state = State.launch if state is None else state
+
+ @classmethod
+ def decode(cls, encoded: dict):
+ return DST(State(encoded["curr_state"]))
+
+ def transition(self, input_alphabet, context) -> Union[str, State]:
+ """
+        Given the input alphabet and context (anything that describes the ongoing conversation), this returns what
+        the destination state should be. Returns a message str if there is no valid transition.
+ """
+ if self.curr_state == State.launch:
+ if input_alphabet == 'recipe_question':
+ return State.recipe_selection
+ elif self.curr_state == State.recipe_selection:
+ if input_alphabet == 'option_selection':
+ return State.ingredients
+ return f"Could not make a transition from {self.curr_state} using input {input_alphabet}."
+
+ def update(self, new_state: State):
+ self.curr_state = new_state
+
+ def encode(self) -> dict:
+ return {
+ "curr_state": self.curr_state.value
+ }
+
diff --git a/macaw/core/interaction_handler/__init__.py b/macaw/core/interaction_handler/__init__.py
index 70bb3a1..0bc32cc 100644
--- a/macaw/core/interaction_handler/__init__.py
+++ b/macaw/core/interaction_handler/__init__.py
@@ -1,5 +1,8 @@
"""
-FILE DESCRIPTION
+The interaction handler init.
-Authors: Hamed Zamani (hazamani@microsoft.com)
-"""
\ No newline at end of file
+Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu)
+"""
+from .attributes import CurrentAttributes, UserAttributes
+from .msg import Message
+from .user_requests_db import InteractionDB
diff --git a/macaw/core/interaction_handler/attributes.py b/macaw/core/interaction_handler/attributes.py
new file mode 100644
index 0000000..77c2040
--- /dev/null
+++ b/macaw/core/interaction_handler/attributes.py
@@ -0,0 +1,23 @@
+"""
+The attribute singleton classes.
+
+Authors: George Wei (gzwei@umass.edu)
+"""
+
+
+class Singleton:
+ def __init__(self, **kwargs):
+ self.__dict__.update(kwargs)
+
+ def __new__(self, **kwargs):
+ if not hasattr(self, "instance"):
+ self.instance = super().__new__(self, **kwargs)
+ return self.instance
+
+
+class CurrentAttributes(Singleton):
+ pass
+
+
+class UserAttributes(Singleton):
+ pass
diff --git a/macaw/core/interaction_handler/msg.py b/macaw/core/interaction_handler/msg.py
index 1ae4d28..a186887 100644
--- a/macaw/core/interaction_handler/msg.py
+++ b/macaw/core/interaction_handler/msg.py
@@ -1,44 +1,88 @@
"""
-The message used to represent each interaction in Macaw.
+The message used to represent each interaction in Macaw. It could be either from the user side or from the bot side.
-Authors: Hamed Zamani (hazamani@microsoft.com)
+Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu)
"""
+from datetime import datetime
+from typing import Optional, Union, Dict
+
+from .attributes import UserAttributes
+from ..dialogue_manager.dialogue_manager import DialogManager
class Message:
- def __init__(self, user_interface, user_id, user_info, msg_info, text, timestamp):
+ def __init__(
+ self,
+ user_interface: str,
+ user_id: Union[str, int],
+ text: str,
+ timestamp: datetime,
+ response: Optional[str] = None,
+ user_info: Optional[Dict[str, any]] = None,
+ msg_info: Optional[Dict[str, any]] = None,
+ actions_result: Optional[Dict[str, any]] = None,
+ dialog_manager: Optional[DialogManager] = None,
+ nlp_pipeline_result: Optional[Dict[str, any]] = None,
+ user_attributes: Optional[Dict[str, any]] = None,
+ ):
"""
An object for input and output Message.
Args:
user_interface(str): The interface name used for this message (e.g., 'telegram')
- user_id(str or int): The user ID.
- user_info(dict): The dict containing some more information about the user.
- msg_info(dict): The dict containing some more information about the message.
- text(str): The message text.
- timestamp(int): The timestamp of message in milliseconds.
+ user_id(str | int): The user ID.
+ text(str): The user message text.
+ response(str): (Optional) The bot message text. None indicates bot response hasn't been generated yet.
+ timestamp(datetime.datetime): The timestamp of message.
+ user_info(dict): (Optional) The dict containing some more information about the user.
+ msg_info(dict): (Optional) The dict containing some more information about the message.
+ actions_result(dict): (Optional) The results from the various actions given the conversation history.
+ dialog_manager(DialogManager): (Optional) The dialog manager.
+ nlp_pipeline_result(dict): (Optional) The results from running the NLP pipeline.
+ user_attributes(dict): (Optional) The user attributes for this given message.
"""
+ self.user_interface = user_interface
self.user_id = user_id
- self.user_info = user_info
- self.msg_info = msg_info
self.text = text
+ self.response = response
self.timestamp = timestamp
- self.user_interface = user_interface
+ self.user_info = user_info
+ self.msg_info = msg_info
+ self.actions_result = actions_result
+ self.dialog_manager = dialog_manager
+ self.nlp_pipeline_result = nlp_pipeline_result
+
+ if user_attributes is None:
+ user_attributes = dict()
+
+ self.user_attributes = UserAttributes(**user_attributes)
@classmethod
def from_dict(cls, msg_dict):
"""
- Get a Message object from dict.
+ Get a Message object from serialized dict obtained from database (e.g. MongoDB).
Args:
msg_dict(dict): A dict containing all the information required to construct a Message object.
Returns:
A Message object.
"""
- user_interface = msg_dict['user_interface'] if 'user_interface' in msg_dict else None
- user_id = msg_dict['user_id'] if 'user_id' in msg_dict else None
- user_info = msg_dict['user_info'] if 'user_info' in msg_dict else None
- msg_info = msg_dict['msg_info'] if 'msg_info' in msg_dict else None
- text = msg_dict['text'] if 'text' in msg_dict else None
- timestamp = msg_dict['timestamp'] if 'timestamp' in msg_dict else None
- return cls(user_interface, user_id, user_info, msg_info, text, timestamp)
\ No newline at end of file
+ # Deserialize custom variables of Message class.
+ msg_dict["dialog_manager"] = DialogManager.decode(msg_dict["dialog_manager"])
+
+ return cls(**msg_dict)
+
+ def __iter__(self):
+ """
+        This iterable is needed so that a Message object can be inserted into a MongoDB collection as a dictionary. It
+        should provide encoding logic for all custom data types used inside the Message class.
+ """
+ for attr, value in self.__dict__.items():
+ value_ret = value
+ if isinstance(value, UserAttributes):
+ value_ret = value.__dict__
+ elif isinstance(value, DialogManager):
+ value_ret = value.encode()
+ elif isinstance(value, datetime):
+ value_ret = str(value)
+ yield attr, value_ret
diff --git a/macaw/core/interaction_handler/user_requests_db.py b/macaw/core/interaction_handler/user_requests_db.py
index bbb9342..dfab257 100644
--- a/macaw/core/interaction_handler/user_requests_db.py
+++ b/macaw/core/interaction_handler/user_requests_db.py
@@ -1,12 +1,14 @@
"""
The conversation (or interaction) database implemented using MongoDB.
-Authors: Hamed Zamani (hazamani@microsoft.com)
+Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu)
"""
+from datetime import datetime, timedelta
+from typing import List
+
from pymongo import MongoClient
-from macaw import util
from macaw.core.interaction_handler.msg import Message
@@ -14,23 +16,35 @@ class InteractionDB:
def __init__(self, host, port, dbname):
self.client = MongoClient(host, port)
self.db = self.client[dbname]
- self.col = self.db['macaw_msgs']
+ self.col = self.db["macaw_msgs"]
def insert_one(self, msg):
- if msg.user_id is None or msg.text is None or msg.timestamp is None or msg.user_interface is None:
- raise Exception('Each message should include a user_interface, user_id, text, and timestamp.')
- self.col.insert_one(msg.__dict__)
+ self.col.insert_one(dict(msg))
+
+ def update_one(self, user_id, updates):
+ self.col.find_one_and_update(
+ {"user_id": user_id}, updates, sort=[("timestamp", -1)]
+ )
def get_all(self):
- print('Using get_all is only recommended for development purposes. It is not efficient!')
+ print(
+ "Using get_all is only recommended for development purposes. It is not efficient!"
+ )
return self.dict_list_to_msg_list(self.col.find({}))
- def get_conv_history(self, user_id, max_time, max_count):
+ def get_conv_history(self, user_id, max_time, max_count) -> List[Message]:
if max_time is None:
- res = self.col.find({'user_id': user_id}).sort([('timestamp', -1)])
+ res = self.col.find({"user_id": user_id}, sort=[("timestamp", -1)])
else:
- res = self.col.find({'user_id': user_id,
- 'timestamp': {'$gt': util.current_time_in_milliseconds() - max_time}}).sort([('timestamp', -1)])
+ res = self.col.find(
+ {
+ "user_id": user_id,
+ "timestamp": {
+ "$gt": datetime.utcnow() - timedelta(minutes=max_time)
+ },
+ },
+ sort=[("timestamp", -1)],
+ )
if max_count is not None:
res = res.limit(max_count)
@@ -40,9 +54,10 @@ def close(self):
self.client.close()
@staticmethod
- def dict_list_to_msg_list(msg_dict_list):
- return [Message.from_dict(msg_dict) for msg_dict in msg_dict_list]
-
-
-
-
+ def dict_list_to_msg_list(msg_dict_list) -> List[Message]:
+ msg_list = []
+ for msg_dict in msg_dict_list:
+ msg_dict.pop("_id")
+ message = Message.from_dict(msg_dict=msg_dict)
+ msg_list.append(message)
+ return msg_list
diff --git a/macaw/core/mrc/__init__.py b/macaw/core/mrc/__init__.py
index a56086b..587747a 100644
--- a/macaw/core/mrc/__init__.py
+++ b/macaw/core/mrc/__init__.py
@@ -3,6 +3,7 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
+import logging
from macaw.core.mrc import drqa_mrc
@@ -17,7 +18,8 @@ def get_mrc_model(params):
Returns:
An MRC object for machine reading comprehension.
"""
- params['logger'].info('The MRC model for QA: ' + params['mrc'])
+ logger = logging.getLogger("MacawLogger")
+ logger.info('The MRC model for QA: ' + params['mrc'])
if params['mrc'] == 'drqa':
return drqa_mrc.DrQA(params)
else:
diff --git a/macaw/core/mrc/drqa_mrc.py b/macaw/core/mrc/drqa_mrc.py
index 8c72659..b3e5c4a 100644
--- a/macaw/core/mrc/drqa_mrc.py
+++ b/macaw/core/mrc/drqa_mrc.py
@@ -78,7 +78,3 @@ def get_results(self, conv_list, doc):
for i, p in enumerate(predictions, 1):
results.append(Document(None, None, p[0], p[1]))
return results
-
-
-
-
diff --git a/macaw/core/nlp_pipeline/__init__.py b/macaw/core/nlp_pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/macaw/core/nlp_pipeline/nlp_pipeline.py b/macaw/core/nlp_pipeline/nlp_pipeline.py
new file mode 100644
index 0000000..358ade8
--- /dev/null
+++ b/macaw/core/nlp_pipeline/nlp_pipeline.py
@@ -0,0 +1,68 @@
+import json
+import logging
+import multiprocessing
+import time
+from multiprocessing import Pool
+from typing import List
+
+import requests
+
+from core.interaction_handler import Message
+
+
+class RemoteModel:
+ def __init__(self, model_name: str, endpoint: str):
+ self.model_name = model_name
+ self.endpoint = endpoint
+
+ def run(self, request: dict) -> dict:
+ try:
+ response = requests.post(url=self.endpoint, data=json.dumps(request))
+ return response.json()
+ except Exception as e:
+ return {
+ "response": f"Error in post request call for {self.model_name}: {e}",
+ "error": True
+ }
+
+
+class NlpPipeline:
+ def __init__(self, modules: dict):
+ self.modules = dict()
+ self.logger = logging.getLogger("MacawLogger")
+ for model_name, endpoint in modules.items():
+ self.modules[model_name] = RemoteModel(model_name, endpoint)
+
+ def run(self, conv_list: List[Message]):
+ """
+ Runs all the models and saves their results in the latest conversation (conv_list[0]) message.
+ """
+ nlp_pipeline_result = {}
+
+ with Pool(processes=4) as pool:
+ all_async_results = []
+ for model_name, model in self.modules.items():
+ # pass in required input to every model.
+ model_input = {"text": "input text for model"}
+ async_result = pool.apply_async(
+ func=model.run,
+ args=(model_input,)
+ )
+ all_async_results.append((model_name, async_result))
+ pool.close()
+
+ max_wait_time_in_secs = 10 # wait time for all modules combined.
+ end_time = time.time() + max_wait_time_in_secs
+ for model_name, async_result in all_async_results:
+ try:
+ nlp_pipeline_result[model_name] = async_result.get(timeout=max(end_time - time.time(), 0))
+ except multiprocessing.TimeoutError as te:
+ resp_msg = f"Module {model_name} timed out."
+ self.logger.error(f"{resp_msg} {te}")
+ nlp_pipeline_result[model_name] = {
+ "response": resp_msg,
+ "error": True,
+ }
+ pool.join()
+
+ conv_list[0].nlp_pipeline_result = nlp_pipeline_result
diff --git a/macaw/core/output_handler/naive_output_selection.py b/macaw/core/output_handler/naive_output_selection.py
index c7487bc..33fe450 100644
--- a/macaw/core/output_handler/naive_output_selection.py
+++ b/macaw/core/output_handler/naive_output_selection.py
@@ -1,12 +1,12 @@
"""
The naive output post processing unit.
-Authors: Hamed Zamani (hazamani@microsoft.com)
+Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu)
"""
+from datetime import datetime
-from macaw import util
-from macaw.core.output_handler.output_selection import OutputProcessing
from macaw.core.interaction_handler.msg import Message
+from macaw.core.output_handler.output_selection import OutputProcessing
class NaiveOutputProcessing(OutputProcessing):
@@ -35,23 +35,25 @@ def output_selection(self, conv_list, candidate_outputs):
Returns:
A str denoting the selected action. If none is selected, None is returned.
"""
- if '#get_doc' in candidate_outputs:
- return '#get_doc'
- if 'qa' in candidate_outputs:
- if len(candidate_outputs['qa'][0].text) > 0:
- if conv_list[0].text.endswith('?') \
- or conv_list[0].text.lower().startswith('what') \
- or conv_list[0].text.lower().startswith('who') \
- or conv_list[0].text.lower().startswith('when') \
- or conv_list[0].text.lower().startswith('where') \
- or conv_list[0].text.lower().startswith('how'):
- return 'qa'
- if 'retrieval' in candidate_outputs:
- if len(candidate_outputs['retrieval']) > 0:
- return 'retrieval'
+ if "#get_doc" in candidate_outputs:
+ return "#get_doc"
+ if "qa" in candidate_outputs:
+ if len(candidate_outputs["qa"][0].text) > 0:
+ if (
+ conv_list[0].text.endswith("?")
+ or conv_list[0].text.lower().startswith("what")
+ or conv_list[0].text.lower().startswith("who")
+ or conv_list[0].text.lower().startswith("when")
+ or conv_list[0].text.lower().startswith("where")
+ or conv_list[0].text.lower().startswith("how")
+ ):
+ return "qa"
+ if "retrieval" in candidate_outputs:
+ if len(candidate_outputs["retrieval"]) > 0:
+ return "retrieval"
return None
- def get_output(self, conv, candidate_outputs):
+ def get_output(self, conv_list, candidate_outputs):
"""
The response Message generation method.
@@ -65,35 +67,46 @@ def get_output(self, conv, candidate_outputs):
Returns:
A response Message to be sent to the user.
"""
- user_id = conv[0].user_id
- user_info = conv[0].user_info
+ user_id = conv_list[0].user_id
+ user_info = conv_list[0].user_info
msg_info = dict()
- msg_info['msg_id'] = conv[0].msg_info['msg_id']
- msg_info['msg_source'] = 'system'
- text = ''
- user_interface = conv[0].user_interface
+ msg_info["msg_id"] = conv_list[0].msg_info["msg_id"]
+ msg_info["msg_source"] = "system"
+ response = ""
+ user_interface = conv_list[0].user_interface
- selected_action = self.output_selection(conv, candidate_outputs)
+ selected_action = self.output_selection(conv_list, candidate_outputs)
if selected_action is None:
- msg_info['msg_type'] = 'text'
- msg_info['msg_creator'] = 'no answer error'
- text = 'No response has been found! Please try again!'
- elif selected_action == 'qa':
- msg_info['msg_type'] = conv[0].msg_info['msg_type']
- msg_info['msg_creator'] = 'qa'
- text = candidate_outputs['qa'][0].text
- elif selected_action == 'retrieval':
- msg_info['msg_type'] = 'options'
- msg_info['msg_creator'] = 'retrieval'
- text = 'Retrieved document list (click to see the document content):'
- msg_info['options'] = [(output.title, '#get_doc ' + output.id, output.score) for output in candidate_outputs['retrieval']]
- elif selected_action == '#get_doc':
- msg_info['msg_type'] = 'text'
- msg_info['msg_creator'] = '#get_doc'
- text = candidate_outputs['#get_doc'][0].text
+ msg_info["msg_type"] = "text"
+ msg_info["msg_creator"] = "no answer error"
+ response = "No response has been found! Please try again!"
+ elif selected_action == "qa":
+ msg_info["msg_type"] = conv_list[0].msg_info["msg_type"]
+ msg_info["msg_creator"] = "qa"
+ response = candidate_outputs["qa"][0].text
+ elif selected_action == "retrieval":
+ msg_info["msg_type"] = "options"
+ msg_info["msg_creator"] = "retrieval"
+ response = "Retrieved document list (click to see the document content):"
+ msg_info["options"] = [
+ (output.title, "#get_doc " + output.id, output.score)
+ for output in candidate_outputs["retrieval"]
+ ]
+ elif selected_action == "#get_doc":
+ msg_info["msg_type"] = "text"
+ msg_info["msg_creator"] = "#get_doc"
+ response = candidate_outputs["#get_doc"][0].text
else:
- raise Exception('The candidate output key is not familiar!')
- timestamp = util.current_time_in_milliseconds()
- if timestamp <= conv[0].timestamp:
- raise Exception('There is a problem in the output timestamp!')
- return Message(user_interface, user_id, user_info, msg_info, text, timestamp)
\ No newline at end of file
+ raise Exception("The candidate output key is not familiar!")
+ timestamp = datetime.utcnow()
+ if timestamp <= conv_list[0].timestamp:
+ raise Exception("There is a problem in the output timestamp!")
+ return Message(
+ user_interface=user_interface,
+ user_id=user_id,
+ user_info=user_info,
+ msg_info=msg_info,
+ text=conv_list[0].text,
+ response=response,
+ timestamp=timestamp
+ )
diff --git a/macaw/core/response/__init__.py b/macaw/core/response/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/macaw/core/input_handler/action_detection.py b/macaw/core/response/action_detection.py
similarity index 95%
rename from macaw/core/input_handler/action_detection.py
rename to macaw/core/response/action_detection.py
index a49439f..dc4810e 100644
--- a/macaw/core/input_handler/action_detection.py
+++ b/macaw/core/response/action_detection.py
@@ -6,7 +6,7 @@
import multiprocessing
-from macaw.core.input_handler import actions
+from macaw.core.response import actions
class PreActionRequestDispatcher:
@@ -99,7 +99,10 @@ def dispatch(self, conv_list):
manager = multiprocessing.Manager()
action_results = manager.dict()
for action in self.params['actions']:
- p = multiprocessing.Process(target=actions.run_action, args=[action, conv_list.copy(), self.params, action_results])
+ p = multiprocessing.Process(
+ target=actions.run_action,
+ args=[action, conv_list.copy(), self.params, action_results]
+ )
action_processes.append(p)
p.start()
@@ -107,7 +110,7 @@ def dispatch(self, conv_list):
p.join()
candidate_outputs = dict()
- for key in action_results:
+ for key in action_results.keys():
if action_results[key]:
candidate_outputs[key] = action_results[key]
return candidate_outputs
diff --git a/macaw/core/input_handler/actions.py b/macaw/core/response/actions.py
similarity index 85%
rename from macaw/core/input_handler/actions.py
rename to macaw/core/response/actions.py
index 1b8fb38..d877699 100644
--- a/macaw/core/input_handler/actions.py
+++ b/macaw/core/response/actions.py
@@ -4,9 +4,10 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-from abc import ABC, abstractmethod
-from func_timeout import func_timeout, FunctionTimedOut
import traceback
+from abc import ABC, abstractmethod
+
+from func_timeout import FunctionTimedOut, func_timeout
class Action(ABC):
@@ -38,7 +39,7 @@ def run(conv_list, params):
Returns:
A list of Documents.
"""
- return params['actions']['retrieval'].get_results(conv_list)
+ return params["actions"]["retrieval"].get_results(conv_list)
class GetDocFromIndex(Action):
@@ -54,7 +55,7 @@ def run(conv_list, params):
Returns:
A list of Documents with a length of 1.
"""
- return params['actions']['retrieval'].get_doc_from_index(params['doc_id'])
+ return params["actions"]["retrieval"].get_doc_from_index(params["doc_id"])
class QAAction(Action):
@@ -73,12 +74,12 @@ def run(conv_list, params):
"""
doc_list = RetrievalAction.run(conv_list, params)
- doc = ''
+ doc = ""
for i in range(len(doc_list)):
doc = doc_list[i].text
if len(doc.strip()) > 0:
break
- return params['actions']['qa'].get_results(conv_list, doc)
+ return params["actions"]["qa"].get_results(conv_list, doc)
def run_action(action, conv_list, params, return_dict):
@@ -93,17 +94,21 @@ def run_action(action, conv_list, params, return_dict):
return_dict(dict): A shared dict for all processes running this action. The actions' outputs should be added to
this dict.
"""
- if action == 'retrieval':
+ if action == "retrieval":
action_func = RetrievalAction.run
- elif action == 'qa':
+ elif action == "qa":
action_func = QAAction.run
else:
- raise Exception('Unknown Action!')
+ raise Exception("Unknown Action!")
try:
- return_dict[action] = func_timeout(params['timeout'], action_func, args=[conv_list, params])
+ return_dict[action] = func_timeout(
+ params["timeout"], action_func, args=[conv_list, params]
+ )
except FunctionTimedOut:
- params['logger'].warning('The action "%s" did not respond in %d seconds.', action, params['timeout'])
+ params["logger"].warning(
+ 'The action "%s" did not respond in %d seconds.', action, params["timeout"]
+ )
except Exception:
return_dict[action] = None
traceback.print_exc()
diff --git a/macaw/core/response/docker.py b/macaw/core/response/docker.py
new file mode 100644
index 0000000..fd75cdc
--- /dev/null
+++ b/macaw/core/response/docker.py
@@ -0,0 +1,30 @@
+import json
+
+import requests
+
+from core.response.response_generator import ResponseGenerator
+
+
+class ResponseGeneratorDocker(ResponseGenerator):
+ """
+ A class that encapsulates an ML model running in a local docker container.
+ """
+
+ def __init__(self, name: str, endpoint: str):
+ super().__init__(name)
+ self.endpoint = endpoint
+
+ def run(self, conv_list) -> dict:
+ # extract request from the conversation.
+ request = {
+ "text": "input message for RG docker model."
+ }
+
+ try:
+ response = requests.post(url=self.endpoint, data=json.dumps(request))
+ return response.json()
+ except Exception as e:
+ return {
+ "response": f"Error in post request call for {self.name}: {e}",
+ "error": True
+ }
diff --git a/macaw/core/response/handler.py b/macaw/core/response/handler.py
new file mode 100644
index 0000000..095ed96
--- /dev/null
+++ b/macaw/core/response/handler.py
@@ -0,0 +1,37 @@
+import logging
+from typing import List
+
+from core.response.docker import ResponseGeneratorDocker
+from core.response.response_generator import ResponseGenerator
+
+
+class ResponseGeneratorHandler:
+ def __init__(self, rg_models: dict):
+ self.rg_models = dict()
+ self.logger = logging.getLogger("MacawLogger")
+
+ for model_name, model in rg_models.items():
+ if isinstance(model, ResponseGenerator):
+ self.rg_models[model_name] = model
+ elif isinstance(model, str) and model.startswith("http://"):
+ self.rg_models[model_name] = ResponseGeneratorDocker(model_name, model)
+ else:
+ self.logger.warning(f"Response generator {model_name}:{model} is not supported.")
+
+ def models_selector(self, conv_list) -> List[str]:
+ selected_models = []
+ for model_name in self.rg_models:
+ if model_name == "qa":
+ # decide if qa RG should be run or not here.
+ selected_models.append(model_name)
+ elif model_name == "punctuation":
+ selected_models.append(model_name)
+ else:
+ self.logger.warning(f"Model selector not written for {model_name}. Ignoring the model.")
+ return selected_models
+
+ def run_models(self, model_names: List[str], conv_list) -> dict:
+ models_response = dict()
+ for model_name in model_names:
+ models_response[model_name] = self.rg_models[model_name].run(conv_list)
+ return models_response
diff --git a/macaw/core/response/punctuation.py b/macaw/core/response/punctuation.py
new file mode 100644
index 0000000..ef12b86
--- /dev/null
+++ b/macaw/core/response/punctuation.py
@@ -0,0 +1,34 @@
+import logging
+import time
+
+from macaw.util.custom_logging import LoggerFactory
+from deepmultilingualpunctuation import PunctuationModel
+from core.response.response_generator import ResponseGenerator
+
+
+class ResponseGeneratorPunctuation(ResponseGenerator):
+ def __init__(self, name):
+ super().__init__(name)
+ self.logger = LoggerFactory.create_logger(params={
+ "logger_name": "macaw.core.response.response_generator.punctuation",
+ "logger_level": logging.INFO
+ })
+
+ self.logger.info("Going to download punctuation ML model. This takes time.")
+ # self.model = PunctuationModel()
+ self.logger.info("Punctuation model loaded successfully.")
+
+ def run(self, conv_list) -> dict:
+ # generate input text from the input conversation.
+ input_text = "My name is Clara and I live in Berkeley California Ist das eine Frage Frau Müller"
+
+ t0 = time.time()
+ # result = self.model.restore_punctuation(input_text)
+ result = "punctuation model output."
+ duration = time.time() - t0
+
+ return {
+ "response": f"Local RG: {self.name}. {result}",
+ "error": False,
+ "performance": duration
+ }
diff --git a/macaw/core/response/response_generator.py b/macaw/core/response/response_generator.py
new file mode 100644
index 0000000..a97aef7
--- /dev/null
+++ b/macaw/core/response/response_generator.py
@@ -0,0 +1,14 @@
+from abc import ABC, abstractmethod
+
+
+class ResponseGenerator(ABC):
+ """
+ An abstract class for response generator (RG). It can be implemented as a local RG using a local class or as a
+ remote RG using a docker container to generate the response.
+ """
+ def __init__(self, name: str):
+ self.name = name
+
+ @abstractmethod
+ def run(self, conv_list) -> dict:
+ pass
diff --git a/macaw/core/retrieval/__init__.py b/macaw/core/retrieval/__init__.py
index 994182e..2c0dabe 100644
--- a/macaw/core/retrieval/__init__.py
+++ b/macaw/core/retrieval/__init__.py
@@ -3,16 +3,18 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
+import logging
+
import macaw.core.retrieval.bing_api
-import macaw.core.retrieval.indri
-from macaw.core.retrieval import search_engine, query_generation
+import macaw.core.retrieval.tantivy
+from macaw.core.retrieval import query_generation
def get_retrieval_model(params):
"""
This method returns the Retrieval class requested in the parameter dict.
Args:
- params(dict): A dict of parameters. In this method, the parameters 'logger' and 'query_generation', and
+ params(dict): A dict of parameters. In this method, the parameters 'query_generation', and
'search_engine' are required. Based on the requested retrievel model, some more parameters may be mandatory.
Currently, Macaw serves two different search engines. One is based on indri (http://lemurproject.org/indri.php),
and the other one is the Microsoft Bing API. If you want to retrieve results from your own document collection,
@@ -21,24 +23,34 @@ def get_retrieval_model(params):
Returns:
A Retrieval object for document retrieval.
"""
- params['logger'].info('The query generation model for retrieval: ' + params['query_generation'])
- if params['query_generation'] == 'simple':
+ logger = logging.getLogger("MacawLogger")
+ logger.info(
+ "The query generation model for retrieval: " + params["query_generation"]
+ )
+ if params["query_generation"] == "simple":
q_generation = query_generation.SimpleQueryGeneration(params)
else:
- raise Exception('The requested query generation model does not exist!')
+ raise Exception("The requested query generation model does not exist!")
- params['logger'].info('The search engine for retrieval: ' + params['search_engine'])
- if params['search_engine'] == 'indri':
- return macaw.core.retrieval.indri.Indri({'query_generation': q_generation,
- 'indri_path': params['search_engine_path'],
- 'index': params['col_index'],
- 'text_format': params['col_text_format'],
- 'results_requested': params['results_requested'],
- 'logger': params['logger']})
- elif params['search_engine'] == 'bing':
- return macaw.core.retrieval.bing_api.BingWebSearch({'query_generation': q_generation,
- 'bing_key': params['bing_key'],
- 'results_requested': params['results_requested'],
- 'logger': params['logger']})
+ logger.info("The search engine for retrieval: " + params["search_engine"])
+ if params["search_engine"] == "tantivy":
+ return macaw.core.retrieval.tantivy.Tantivy(
+ {
+ "query_generation": q_generation,
+ "path": params["search_engine_path"],
+ "load": True,
+ "results_requested": params["results_requested"],
+ "logger": logger,
+ }
+ )
+ elif params["search_engine"] == "bing":
+ return macaw.core.retrieval.bing_api.BingWebSearch(
+ {
+ "query_generation": q_generation,
+ "bing_key": params["bing_key"],
+ "results_requested": params["results_requested"],
+ "logger": logger,
+ }
+ )
else:
- raise Exception('The requested retrieval model does not exist!')
\ No newline at end of file
+ raise Exception(f'{params["search_engine"]} retrieval model does not exist!')
diff --git a/macaw/core/retrieval/indri.py b/macaw/core/retrieval/indri.py
deleted file mode 100644
index 77bb073..0000000
--- a/macaw/core/retrieval/indri.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""
-The Indri search engine.
-
-Authors: Hamed Zamani (hazamani@microsoft.com)
-"""
-
-import os
-import subprocess
-
-import pyndri
-
-from macaw.core.retrieval.doc import get_trec_doc
-from macaw.core.retrieval.search_engine import Retrieval
-
-
-class Indri(Retrieval):
- def __init__(self, params):
- """
- The Indri retrieval model. Indri is an open-source search engine implemented as part of the lemur project by
- UMass Amherst and CMU. Refer to http://lemurproject.org/indri.php for more information.
- The retrieval model used here is based on language modeling framework and retrieves documents using the query
- likelihood retrieval model [Ponte & Croft; SIGIR 1998] and Dirichlet prior smoothing [Zhai and Lafferty; SIGIR
- 2001]. It is implemented using the Pyndri [Van Gysel et al.; ECIR 2017], which is a python interface to Indri.
- Refer to http://lemurproject.org/indri.php for more information on the Lemur toolkit.
-
- Args:
- params(dict): A dict containing some parameters. Here is the list of all required parameters:
- 'indri_path': The path to the installed Indri toolkit.
- 'index': The path to the Indri index constructed from the collection.
- 'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1.
- 'text_format': The text format for document collection (e.g., 'trectext').
- Note that the parameters 'query_generation' and 'logger' are required by the parent class.
- """
- super().__init__(params)
- self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1
- self.indri_path = self.params['indri_path']
- self.index = pyndri.Index(self.params['index'])
- self.term2id, self.id2term, self.id2df = self.index.get_dictionary()
- self.id2tf = self.index.get_term_frequencies()
-
- def retrieve(self, query):
- """
- This method retrieve documents in response to the given query.
-
- Args:
- query(str): The query string.
-
- Returns:
- A list of Documents with the maximum length of the 'results_requested' parameter.
- """
- int_results = self.index.query(query, results_requested=self.results_requested)
- results = []
- for int_doc_id, score in int_results:
- # ext_doc_id, content_term_id = self.index.document(int_doc_id)
- # index_content = [self.id2term[term_id] if term_id> 0 else 'UNK' for term_id in content_term_id]
- doc = self.get_doc_from_index(int_doc_id)[0]
- doc.score = score
- doc.id = str(int_doc_id)
- results.append(doc)
- return results
-
- def get_doc_from_index(self, doc_id):
- """
- This method retrieves a document content for a given document id.
-
- Args:
- doc_id(str): The document ID.
-
- Returns:
- A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns
- a list of Documents with a length of 1.
- """
- content = subprocess.run([os.path.join(self.indri_path, 'dumpindex/dumpindex'), self.params['index'],
- 'dt', str(doc_id)], stdout=subprocess.PIPE).stdout.decode('UTF-8')
- if self.params['text_format'] == 'trectext':
- doc = get_trec_doc(content)
- else:
- raise Exception('The requested text format is not supported!')
- return [doc]
\ No newline at end of file
diff --git a/macaw/core/retrieval/query_generation.py b/macaw/core/retrieval/query_generation.py
index 3047eb5..b1139b5 100644
--- a/macaw/core/retrieval/query_generation.py
+++ b/macaw/core/retrieval/query_generation.py
@@ -4,8 +4,8 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-from abc import ABC, abstractmethod
import string
+from abc import ABC, abstractmethod
class QueryGeneration(ABC):
@@ -59,12 +59,14 @@ def get_query(self, conv_list):
"""
# q = ' '.join(msg.text for msg in conv_list)
q = conv_list[0].text
- if 'use_coref' in self.params and self.params['use_coref']:
+ if "use_coref" in self.params and self.params["use_coref"]:
q_coref = self.get_query_coref(conv_list)
for key in q_coref:
- q += ' ' + ' '.join(q_coref[key])
+ q += " " + " ".join(q_coref[key])
- q = q.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip()
+ q = q.translate(
+ str.maketrans(string.punctuation, " " * len(string.punctuation))
+ ).strip()
# print(q)
return q
@@ -84,20 +86,20 @@ def get_query_coref(self, conv_list):
"""
corenlp_coref_result = self.compute_corefs(conv_list)
q_coref = dict()
- last_index = len(corenlp_coref_result['sentences'])
- for key in corenlp_coref_result['corefs']:
+ last_index = len(corenlp_coref_result["sentences"])
+ for key in corenlp_coref_result["corefs"]:
has_coref = False
- for item in corenlp_coref_result['corefs'][key]:
- if item['sentNum'] == last_index:
+ for item in corenlp_coref_result["corefs"][key]:
+ if item["sentNum"] == last_index:
has_coref = True
- text = item['text']
+ text = item["text"]
break
if has_coref:
q_coref[text] = []
- for item in corenlp_coref_result['corefs'][key]:
- if item['sentNum'] == last_index:
+ for item in corenlp_coref_result["corefs"][key]:
+ if item["sentNum"] == last_index:
continue
- q_coref[text].append(item['text'])
+ q_coref[text].append(item["text"])
return q_coref
def compute_corefs(self, conv_list):
@@ -115,15 +117,18 @@ def compute_corefs(self, conv_list):
"""
conv_history = []
for msg in reversed(conv_list):
- if msg.msg_info['msg_source'] == 'user' and msg.msg_info['msg_type'] in ['text', 'voice']:
- temp = msg.text if msg.text.endswith('?') else (msg.text + '?')
+ if msg.msg_info["msg_source"] == "user" and msg.msg_info["msg_type"] in [
+ "text",
+ "voice",
+ ]:
+ temp = msg.text if msg.text.endswith("?") else (msg.text + "?")
conv_history.append(temp)
# elif msg.msg_info['msg_source'] == 'system' and msg.msg_info['msg_type'] == 'text' and len(msg.text.split()) < 30:
# temp = msg.text + '.'
# conv_history.append(temp)
if len(conv_history) == 0:
- raise Exception('The query generation model cannot generate any query! There should be a problem')
- coref_results = self.params['nlp_util'].get_coref(' '.join(conv_history))
+ raise Exception(
+ "The query generation model cannot generate any query! There should be a problem"
+ )
+ coref_results = self.params["nlp_util"].get_coref(" ".join(conv_history))
return coref_results
-
-
diff --git a/macaw/core/retrieval/tantivy.py b/macaw/core/retrieval/tantivy.py
new file mode 100644
index 0000000..1e23c39
--- /dev/null
+++ b/macaw/core/retrieval/tantivy.py
@@ -0,0 +1,52 @@
+"""
+The Tantivy search engine.
+
+Authors: Arkin Dharawat (adharawat@umass.edu)
+"""
+
+import os
+
+import tantivy
+
+from macaw.core.retrieval.doc import get_trec_doc
+from macaw.core.retrieval.search_engine import Retrieval
+
+
+class Tantivy(Retrieval):
+ def __init__(self, params):
+ super().__init__(params)
+ self.results_requested = (
+ self.params["results_requested"]
+ if "results_requested" in self.params
+ else 1
+ )
+
+ if not os.path.exists(params["path"]):
+ os.mkdir(params["path"])
+ schema_builder = tantivy.SchemaBuilder()
+ schema_builder.add_text_field("body", stored=True)
+ schema_builder.add_unsigned_field("doc_id", stored=True)
+ schema = schema_builder.build()
+ self.index = tantivy.Index(schema, path=params["path"], reuse=params["load"])
+ self.searcher = self.index.searcher()
+ self.logger = params["logger"]
+ self.logger.info("Loaded index and searcher")
+
+ def retrieve(self, query):
+ docs = []
+ try:
+ query = self.index.parse_query(query, ["body"])
+ scores = self.searcher.search(query, self.results_requested).hits
+ # docs = [(self.searcher.doc(doc_id)['doc_id'], score) for score, doc_id in scores]
+ docs = [
+ get_trec_doc(self.searcher.doc(doc_id)["body"][0])
+ for _, doc_id in scores
+ ] # convert the raw text into trec-doc
+ except Exception as e:
+ self.logger.error(f"Error on Query {e}")
+ return None
+
+ return docs
+
+ def get_doc_from_index(self, doc_id):
+        return [get_trec_doc(self.searcher.doc(doc_id)["body"][0])]
diff --git a/macaw/docker/flask_app/Dockerfile b/macaw/docker/flask_app/Dockerfile
new file mode 100644
index 0000000..dce887d
--- /dev/null
+++ b/macaw/docker/flask_app/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.8.13
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ nginx \
+ supervisor \
+ gcc \
+ g++ \
+ curl \
+ python3-pip \
+ vim && rm -rf /var/lib/apt/lists/*
+
+# update pip
+RUN pip3 install pip --upgrade
+
+# Setup flask application
+RUN mkdir -p /deploy/app
+# copy and install requirements before so that other changes do not require
+# downloading requirements again.
+COPY app/requirements.txt /deploy/app/requirements.txt
+RUN pip3 install -r /deploy/app/requirements.txt
+COPY app /deploy/app
+
+# Setup nginx
+RUN rm /etc/nginx/sites-enabled/default
+COPY config/flask.conf /etc/nginx/sites-available/
+RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf
+RUN echo "daemon off;" >> /etc/nginx/nginx.conf
+
+RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn
+# Setup supervisord
+RUN mkdir -p /var/log/supervisor
+COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf
+
+EXPOSE 80
+
+# Start processes
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
diff --git a/macaw/docker/flask_app/app/app.py b/macaw/docker/flask_app/app/app.py
new file mode 100644
index 0000000..9adbaef
--- /dev/null
+++ b/macaw/docker/flask_app/app/app.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import time
+import logging
+import sys
+from typing import Optional
+
+from flask import Flask, request, Response
+from flask_restful import reqparse, Api, Resource
+
+import remote_module
+
+app = Flask("remote module")
+api = Api(app)
+
+app.logger.addHandler(logging.StreamHandler(sys.stdout))
+app.logger.setLevel(logging.DEBUG)
+
+
+class RemoteModule(Resource):
+
+ def get(self):
+ return 200
+
+ def post(self):
+ t0 = time.time()
+
+ args = request.get_json(force=True)
+ print(f"post request arguments {args}")
+ validation = self.__validate_input(args)
+ if validation:
+            return validation, 400
+
+ ret = {}
+ ret.update(self.__get_response(args))
+ ret["performance"] = time.time() - t0
+ ret["error"] = False
+ return ret, 200
+
+ @staticmethod
+ def __validate_input(args: dict) -> Optional[dict]:
+ message = ""
+ for ctx in remote_module.get_required_context():
+ if not args.get(ctx):
+ message = "Context missing: " + ctx
+ if message:
+ return {
+ "message": message,
+ "error": True
+ }
+ return None
+
+ @staticmethod
+ def __get_response(msg) -> dict:
+ response = remote_module.handle_message(msg)
+ app.logger.info("remote model result: %s", response)
+ return response
+
+
+api.add_resource(RemoteModule, '/')
+
+if __name__ == '__main__':
+ app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8001)
diff --git a/macaw/docker/flask_app/app/remote_module.py b/macaw/docker/flask_app/app/remote_module.py
new file mode 100644
index 0000000..589f223
--- /dev/null
+++ b/macaw/docker/flask_app/app/remote_module.py
@@ -0,0 +1,15 @@
+
+required_context = ["text"]
+
+
+def get_required_context():
+ return required_context
+
+
+def handle_message(msg: dict) -> dict:
+ """
+ Returns the response dictionary. Throws appropriate error if the model cannot generate a response.
+ """
+ return {
+ "response": "okay from flask app."
+ }
diff --git a/macaw/docker/flask_app/app/requirements.txt b/macaw/docker/flask_app/app/requirements.txt
new file mode 100644
index 0000000..dea81b5
--- /dev/null
+++ b/macaw/docker/flask_app/app/requirements.txt
@@ -0,0 +1,11 @@
+Flask==2.1.2
+Flask-RESTful==0.3.9
+Werkzeug
+future
+jsonpickle
+numpy
+pandoc
+six
+typing
+ConfigArgParse
+gunicorn
diff --git a/macaw/docker/flask_app/config/flask.conf b/macaw/docker/flask_app/config/flask.conf
new file mode 100644
index 0000000..3b68b1c
--- /dev/null
+++ b/macaw/docker/flask_app/config/flask.conf
@@ -0,0 +1,9 @@
+server {
+ listen 80;
+
+ location / {
+ proxy_pass http://localhost:5001/;
+ proxy_set_header Host $host;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ }
+}
diff --git a/macaw/docker/flask_app/config/gunicorn.conf b/macaw/docker/flask_app/config/gunicorn.conf
new file mode 100644
index 0000000..bfecb41
--- /dev/null
+++ b/macaw/docker/flask_app/config/gunicorn.conf
@@ -0,0 +1,3 @@
+[program:gunicorn]
+command=/usr/bin/gunicorn app:app -b localhost:5001
+directory=/deploy/app
diff --git a/macaw/docker/flask_app/config/supervisord.conf b/macaw/docker/flask_app/config/supervisord.conf
new file mode 100644
index 0000000..a32f072
--- /dev/null
+++ b/macaw/docker/flask_app/config/supervisord.conf
@@ -0,0 +1,18 @@
+[supervisord]
+nodaemon=true
+user=root
+
+[program:nginx]
+command=/usr/sbin/nginx
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+[program:gunicorn]
+command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info
+directory=/deploy/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
diff --git a/macaw/docker/ic_app/Dockerfile b/macaw/docker/ic_app/Dockerfile
new file mode 100644
index 0000000..dce887d
--- /dev/null
+++ b/macaw/docker/ic_app/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.8.13
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ nginx \
+ supervisor \
+ gcc \
+ g++ \
+ curl \
+ python3-pip \
+ vim && rm -rf /var/lib/apt/lists/*
+
+# update pip
+RUN pip3 install pip --upgrade
+
+# Setup flask application
+RUN mkdir -p /deploy/app
+# copy and install requirements before so that other changes do not require
+# downloading requirements again.
+COPY app/requirements.txt /deploy/app/requirements.txt
+RUN pip3 install -r /deploy/app/requirements.txt
+COPY app /deploy/app
+
+# Setup nginx
+RUN rm /etc/nginx/sites-enabled/default
+COPY config/flask.conf /etc/nginx/sites-available/
+RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf
+RUN echo "daemon off;" >> /etc/nginx/nginx.conf
+
+RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn
+# Setup supervisord
+RUN mkdir -p /var/log/supervisor
+COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf
+
+EXPOSE 80
+
+# Start processes
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
diff --git a/macaw/docker/ic_app/app/app.py b/macaw/docker/ic_app/app/app.py
new file mode 100644
index 0000000..9adbaef
--- /dev/null
+++ b/macaw/docker/ic_app/app/app.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import time
+import logging
+import sys
+from typing import Optional
+
+from flask import Flask, request, Response
+from flask_restful import reqparse, Api, Resource
+
+import remote_module
+
+app = Flask("remote module")
+api = Api(app)
+
+app.logger.addHandler(logging.StreamHandler(sys.stdout))
+app.logger.setLevel(logging.DEBUG)
+
+
+class RemoteModule(Resource):
+
+ def get(self):
+ return 200
+
+ def post(self):
+ t0 = time.time()
+
+ args = request.get_json(force=True)
+ print(f"post request arguments {args}")
+ validation = self.__validate_input(args)
+ if validation:
+            return validation, 400
+
+ ret = {}
+ ret.update(self.__get_response(args))
+ ret["performance"] = time.time() - t0
+ ret["error"] = False
+ return ret, 200
+
+ @staticmethod
+ def __validate_input(args: dict) -> Optional[dict]:
+ message = ""
+ for ctx in remote_module.get_required_context():
+ if not args.get(ctx):
+ message = "Context missing: " + ctx
+ if message:
+ return {
+ "message": message,
+ "error": True
+ }
+ return None
+
+ @staticmethod
+ def __get_response(msg) -> dict:
+ response = remote_module.handle_message(msg)
+ app.logger.info("remote model result: %s", response)
+ return response
+
+
+api.add_resource(RemoteModule, '/')
+
+if __name__ == '__main__':
+ app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8001)
diff --git a/macaw/docker/ic_app/app/remote_module.py b/macaw/docker/ic_app/app/remote_module.py
new file mode 100644
index 0000000..bff776c
--- /dev/null
+++ b/macaw/docker/ic_app/app/remote_module.py
@@ -0,0 +1,15 @@
+
+required_context = ["text"]
+
+
+def get_required_context():
+ return required_context
+
+
+def handle_message(msg: dict) -> dict:
+ """
+ Returns the response dictionary. Throws appropriate error if the model cannot generate a response.
+ """
+ return {
+ "response": "okay from ic model."
+ }
diff --git a/macaw/docker/ic_app/app/requirements.txt b/macaw/docker/ic_app/app/requirements.txt
new file mode 100644
index 0000000..dea81b5
--- /dev/null
+++ b/macaw/docker/ic_app/app/requirements.txt
@@ -0,0 +1,11 @@
+Flask==2.1.2
+Flask-RESTful==0.3.9
+Werkzeug
+future
+jsonpickle
+numpy
+pandoc
+six
+typing
+ConfigArgParse
+gunicorn
diff --git a/macaw/docker/ic_app/config/flask.conf b/macaw/docker/ic_app/config/flask.conf
new file mode 100644
index 0000000..3b68b1c
--- /dev/null
+++ b/macaw/docker/ic_app/config/flask.conf
@@ -0,0 +1,9 @@
+server {
+ listen 80;
+
+ location / {
+ proxy_pass http://localhost:5001/;
+ proxy_set_header Host $host;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ }
+}
diff --git a/macaw/docker/ic_app/config/gunicorn.conf b/macaw/docker/ic_app/config/gunicorn.conf
new file mode 100644
index 0000000..bfecb41
--- /dev/null
+++ b/macaw/docker/ic_app/config/gunicorn.conf
@@ -0,0 +1,3 @@
+[program:gunicorn]
+command=/usr/bin/gunicorn app:app -b localhost:5001
+directory=/deploy/app
diff --git a/macaw/docker/ic_app/config/supervisord.conf b/macaw/docker/ic_app/config/supervisord.conf
new file mode 100644
index 0000000..a32f072
--- /dev/null
+++ b/macaw/docker/ic_app/config/supervisord.conf
@@ -0,0 +1,18 @@
+[supervisord]
+nodaemon=true
+user=root
+
+[program:nginx]
+command=/usr/sbin/nginx
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+[program:gunicorn]
+command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info
+directory=/deploy/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
diff --git a/macaw/docker/qa_app/Dockerfile b/macaw/docker/qa_app/Dockerfile
new file mode 100644
index 0000000..dce887d
--- /dev/null
+++ b/macaw/docker/qa_app/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.8.13
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ nginx \
+ supervisor \
+ gcc \
+ g++ \
+ curl \
+ python3-pip \
+ vim && rm -rf /var/lib/apt/lists/*
+
+# update pip
+RUN pip3 install pip --upgrade
+
+# Setup flask application
+RUN mkdir -p /deploy/app
+# copy and install requirements before so that other changes do not require
+# downloading requirements again.
+COPY app/requirements.txt /deploy/app/requirements.txt
+RUN pip3 install -r /deploy/app/requirements.txt
+COPY app /deploy/app
+
+# Setup nginx
+RUN rm /etc/nginx/sites-enabled/default
+COPY config/flask.conf /etc/nginx/sites-available/
+RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf
+RUN echo "daemon off;" >> /etc/nginx/nginx.conf
+
+RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn
+# Setup supervisord
+RUN mkdir -p /var/log/supervisor
+COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf
+
+EXPOSE 80
+
+# Start processes
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
diff --git a/macaw/docker/qa_app/app/app.py b/macaw/docker/qa_app/app/app.py
new file mode 100644
index 0000000..eb48d75
--- /dev/null
+++ b/macaw/docker/qa_app/app/app.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import time
+import logging
+import sys
+from typing import Optional
+
+from flask import Flask, request, Response
+from flask_restful import reqparse, Api, Resource
+
+import remote_module
+
+app = Flask("remote module")
+api = Api(app)
+
+app.logger.addHandler(logging.StreamHandler(sys.stdout))
+app.logger.setLevel(logging.DEBUG)
+
+
+class RemoteModule(Resource):
+
+ def get(self):
+ return 200
+
+ def post(self):
+ t0 = time.time()
+
+ args = request.get_json(force=True)
+ print(f"post request arguments {args}")
+ validation = self.__validate_input(args)
+ if validation:
+            return validation, 400
+
+ ret = {}
+ ret.update(self.__get_response(args))
+ ret["performance"] = time.time() - t0
+ ret["error"] = False
+ return ret, 200
+
+ @staticmethod
+ def __validate_input(args: dict) -> Optional[dict]:
+ message = ""
+ for ctx in remote_module.get_required_context():
+ if not args.get(ctx):
+ message = "Context missing: " + ctx
+ if message:
+ return {
+ "message": message,
+ "error": True
+ }
+ return None
+
+ @staticmethod
+ def __get_response(msg) -> dict:
+ response = remote_module.handle_message(msg)
+ app.logger.info("remote model result: %s", response)
+ return response
+
+
+api.add_resource(RemoteModule, '/')
+
+if __name__ == '__main__':
+ app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8003)
diff --git a/macaw/docker/qa_app/app/remote_module.py b/macaw/docker/qa_app/app/remote_module.py
new file mode 100644
index 0000000..af29ba8
--- /dev/null
+++ b/macaw/docker/qa_app/app/remote_module.py
@@ -0,0 +1,15 @@
+
+required_context = ["text"]
+
+
+def get_required_context():
+ return required_context
+
+
+def handle_message(msg: dict) -> dict:
+ """
+ Returns the response dictionary. Throws appropriate error if the model cannot generate a response.
+ """
+ return {
+ "response": "okay response from qa model."
+ }
diff --git a/macaw/docker/qa_app/app/requirements.txt b/macaw/docker/qa_app/app/requirements.txt
new file mode 100644
index 0000000..6ad266d
--- /dev/null
+++ b/macaw/docker/qa_app/app/requirements.txt
@@ -0,0 +1,15 @@
+Flask==2.1.2
+Flask-RESTful==0.3.9
+Werkzeug
+future
+jsonpickle
+numpy
+pandoc
+six
+typing
+ConfigArgParse
+gunicorn
+transformers==4.11.2
+itsdangerous==2.0.1
+torch
+boto3
diff --git a/macaw/docker/qa_app/config/flask.conf b/macaw/docker/qa_app/config/flask.conf
new file mode 100644
index 0000000..3b68b1c
--- /dev/null
+++ b/macaw/docker/qa_app/config/flask.conf
@@ -0,0 +1,9 @@
+server {
+ listen 80;
+
+ location / {
+ proxy_pass http://localhost:5001/;
+ proxy_set_header Host $host;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ }
+}
diff --git a/macaw/docker/qa_app/config/gunicorn.conf b/macaw/docker/qa_app/config/gunicorn.conf
new file mode 100644
index 0000000..bfecb41
--- /dev/null
+++ b/macaw/docker/qa_app/config/gunicorn.conf
@@ -0,0 +1,3 @@
+[program:gunicorn]
+command=/usr/bin/gunicorn app:app -b localhost:5001
+directory=/deploy/app
diff --git a/macaw/docker/qa_app/config/supervisord.conf b/macaw/docker/qa_app/config/supervisord.conf
new file mode 100644
index 0000000..a32f072
--- /dev/null
+++ b/macaw/docker/qa_app/config/supervisord.conf
@@ -0,0 +1,18 @@
+[supervisord]
+nodaemon=true
+user=root
+
+[program:nginx]
+command=/usr/sbin/nginx
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+[program:gunicorn]
+command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info
+directory=/deploy/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
diff --git a/macaw/interface/__init__.py b/macaw/interface/__init__.py
index efebf6c..cc504ed 100644
--- a/macaw/interface/__init__.py
+++ b/macaw/interface/__init__.py
@@ -4,20 +4,20 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-from macaw.interface import speech_recognition, telegram, stdio, fileio
+from macaw.interface import fileio, speech_recognition, stdio, telegram
def get_interface(params):
- if 'asr_model' in params and params['asr_model'] == 'google':
- params['asr'] = speech_recognition.GoogleASR(params)
- if 'asg_model' in params and params['asg_model'] == 'google':
- params['asg'] = speech_recognition.GoogleText2Speech(params)
+ if "asr_model" in params and params["asr_model"] == "google":
+ params["asr"] = speech_recognition.GoogleASR(params)
+ if "asg_model" in params and params["asg_model"] == "google":
+ params["asg"] = speech_recognition.GoogleText2Speech(params)
- if params['interface'] == 'telegram':
+ if params["interface"] == "telegram":
return telegram.TelegramBot(params)
- elif params['interface'] == 'stdio':
+ elif params["interface"] == "stdio":
return stdio.StdioInterface(params)
- elif params['interface'] == 'fileio':
+ elif params["interface"] == "fileio":
return fileio.FileioInterface(params)
else:
- raise Exception('The requested interface does not exist!')
\ No newline at end of file
+ raise Exception("The requested interface does not exist!")
diff --git a/macaw/interface/fileio.py b/macaw/interface/fileio.py
index 467fb38..16e90dc 100644
--- a/macaw/interface/fileio.py
+++ b/macaw/interface/fileio.py
@@ -3,11 +3,12 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-
+import json
import time
+from datetime import datetime
-from macaw.interface.interface import Interface
from macaw.core.interaction_handler.msg import Message
+from macaw.interface.interface import Interface
class FileioInterface(Interface):
@@ -16,46 +17,85 @@ def __init__(self, params):
self.msg_id = int(time.time())
def run(self):
- output_file = open(self.params['output_file_path'], 'w+')
- with open(self.params['input_file_path']) as input_file:
+ with open(self.params["output_file_path"], "w+") as output_file_handler, \
+ open(self.params["verbose_output_file_path"], "w+") as verbose_file_handler, \
+ open(self.params["input_file_path"]) as input_file:
for line in input_file:
- str_list = line.strip().split('\t')
- if len(str_list) < 2:
- raise Exception('Each input line should contain at least 2 elements: a query ID and a query text.')
+ str_list = line.strip().split("\t")
+ if len(str_list) != 2:
+ raise Exception(
+                        "Each input line should contain exactly 2 elements: a query ID and a query text. "
+                        f"Invalid line: {line}"
+ )
qid = str_list[0]
- conv_list = []
- for i in range(1, len(str_list)):
- user_info = {'first_name': 'NONE'}
- msg_info = {'msg_id': qid,
- 'msg_type': 'text',
- 'msg_source': 'user'}
- msg = Message(user_interface='NONE',
- user_id=-1,
- user_info=user_info,
- msg_info=msg_info,
- text=str_list[i],
- timestamp=-1)
- conv_list.append(msg)
- conv_list.reverse()
- output_msg = self.params['experimental_request_handler'](conv_list)
- self.result_presentation(output_msg, {'output_file': output_file, 'qid': qid})
- output_file.close()
-
- def result_presentation(self, output_msg, params):
- qid = params['qid']
- output_file = params['output_file']
- if self.params['output_format'] == 'trec':
- if output_msg.msg_info['msg_type'] == 'options':
- for (i, (option_name, option_id, output_score)) in enumerate(output_msg.msg_info['options']):
- output_file.write(qid + '\tQ0\t' + option_name + '\t' + str(i+1) + '\t' + str(output_score) + '\tmacaw\n')
+ user_info = {"first_name": "NONE"}
+ msg_info = {"msg_id": qid, "msg_type": "text", "msg_source": "user"}
+ msg = Message(
+ user_interface="fileio",
+ user_id=-1,
+ user_info=user_info,
+ msg_info=msg_info,
+ text=str_list[1],
+ timestamp=datetime.utcnow(),
+ )
+ output_msg = self.params["experimental_request_handler"](msg)
+ self.result_presentation(
+ output_msg,
+ {
+ "qid": qid,
+ "output_file_handler": output_file_handler,
+ "verbose_file_handler": verbose_file_handler
+ }
+ )
+
+ def result_presentation(self, response_msg: Message, additional_params: dict):
+ """
+ This method writes the result of this turn into output files. Files are already opened before calling this
+ method and handlers passed in as arguments.
+
+ response_msg: the result message generated for this turn.
+ additional_params: dict having params required by this method not present in the self.params class variable.
+ """
+ qid = additional_params["qid"]
+ output_fh = additional_params["output_file_handler"]
+ verbose_fh = additional_params["verbose_file_handler"]
+
+ verbose_fh.write(json.dumps(dict(response_msg)))
+ verbose_fh.write("\n")
+
+ if self.params["output_format"] == "trec":
+ if response_msg.msg_info["msg_type"] == "options":
+ for (i, (option_name, option_id, output_score)) in enumerate(
+ response_msg.msg_info["options"]
+ ):
+ output_fh.write(
+ qid
+ + "\tQ0\t"
+ + option_name
+ + "\t"
+ + str(i + 1)
+ + "\t"
+ + str(output_score)
+ + "\tmacaw\n"
+ )
else:
- raise Exception('TREC output format is only recognized for retrieval results. '
- 'Therefore, the message type should be options.')
- elif self.params['output_format'] == 'text':
- if output_msg.msg_info['msg_type'] == 'text':
- output_file.write(qid + '\t' + output_msg.text.replace('\n', ' ').replace('\t', ' ') + '\n')
+ raise Exception(
+ "TREC output format is only recognized for retrieval results. "
+ "Therefore, the message type should be options."
+ )
+ elif self.params["output_format"] == "text":
+ msg_type = response_msg.msg_info["msg_type"]
+ if msg_type == "text":
+ output_fh.write(
+ qid
+ + "\t"
+ + response_msg.response.replace("\n", " ").replace("\t", " ")
+ + "\n"
+ )
else:
- raise Exception('text output format is only recognized for text outputs.')
+ raise Exception(
+ f"Text output format is only recognized for text outputs. Found {msg_type}."
+ )
else:
- raise Exception('Unknown output file format!')
+ raise Exception("Unknown output file format!")
diff --git a/macaw/interface/interface.py b/macaw/interface/interface.py
index 63312ab..e2638be 100644
--- a/macaw/interface/interface.py
+++ b/macaw/interface/interface.py
@@ -6,6 +6,8 @@
from abc import ABC, abstractmethod
+from macaw.core.interaction_handler.msg import Message
+
class Interface(ABC):
def __init__(self, params):
@@ -16,5 +18,5 @@ def run(self):
pass
@abstractmethod
- def result_presentation(self, response_msg, params):
- pass
\ No newline at end of file
+ def result_presentation(self, response_msg: Message, additional_params: dict):
+ pass
diff --git a/macaw/interface/speech_recognition.py b/macaw/interface/speech_recognition.py
index 52a3ff1..467cfbc 100644
--- a/macaw/interface/speech_recognition.py
+++ b/macaw/interface/speech_recognition.py
@@ -4,30 +4,32 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-from abc import ABC, abstractmethod
import os
import tempfile
+from abc import ABC, abstractmethod
import speech_recognition as sr
from google.cloud import texttospeech
from pydub import AudioSegment
-def mp3_to_ogg(input_file_name): # caller should delete the file afterwards.
+def mp3_to_ogg(input_file_name): # caller should delete the file afterwards.
ogg_file = tempfile.NamedTemporaryFile(delete=False)
- AudioSegment.from_mp3(input_file_name).export(ogg_file.name, format='ogg', parameters=["-acodec", "libopus"])
+ AudioSegment.from_mp3(input_file_name).export(
+ ogg_file.name, format="ogg", parameters=["-acodec", "libopus"]
+ )
ogg_file.close()
return ogg_file.name
-def ogg_to_wav(input_file_name): # caller should delete the file afterwards.
+def ogg_to_wav(input_file_name): # caller should delete the file afterwards.
wav_file = tempfile.NamedTemporaryFile(delete=False)
- AudioSegment.from_ogg(input_file_name).export(wav_file.name, format='wav')
+ AudioSegment.from_ogg(input_file_name).export(wav_file.name, format="wav")
wav_file.close()
return wav_file.name
-class ASR(ABC): # Automatic Speech Recognition
+class ASR(ABC): # Automatic Speech Recognition
def __init__(self, params):
self.params = params
@@ -36,7 +38,7 @@ def speech_to_text(self, file_path):
pass
-class ASG(ABC): # Automatic Speech Generation
+class ASG(ABC): # Automatic Speech Generation
def __init__(self, params):
self.params = params
@@ -62,23 +64,31 @@ def speech_to_text(self, file_path):
except sr.UnknownValueError:
print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
- print("Could not request results from Google Speech Recognition service; {0}".format(e))
+ print(
+ "Could not request results from Google Speech Recognition service; {0}".format(
+ e
+ )
+ )
class GoogleText2Speech(ASG):
def __init__(self, params):
super().__init__(params)
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.params['google-speech-to-text-credential-file']
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.params[
+ "google-speech-to-text-credential-file"
+ ]
# Instantiates a client
self.client = texttospeech.TextToSpeechClient()
# Build the voice request, select the language code ("en-US") and the ssml
# voice gender ("neutral")
self.voice = texttospeech.types.VoiceSelectionParams(
- language_code='en-US',
- ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
+ language_code="en-US",
+ ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL,
+ )
# Select the type of audio file you want returned
self.audio_config = texttospeech.types.AudioConfig(
- audio_encoding=texttospeech.enums.AudioEncoding.MP3)
+ audio_encoding=texttospeech.enums.AudioEncoding.MP3
+ )
def text_to_speech(self, text):
# Set the text input to be synthesized
@@ -86,11 +96,12 @@ def text_to_speech(self, text):
# Perform the text-to-speech request on the text input with the selected
# voice parameters and audio file type
- response = self.client.synthesize_speech(synthesis_input, self.voice, self.audio_config)
+ response = self.client.synthesize_speech(
+ synthesis_input, self.voice, self.audio_config
+ )
mp3_file = tempfile.NamedTemporaryFile(delete=True)
mp3_file.write(response.audio_content)
ogg_file_name = mp3_to_ogg(mp3_file.name)
mp3_file.close()
return ogg_file_name
-
diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py
index aca7aa0..149dfb2 100644
--- a/macaw/interface/stdio.py
+++ b/macaw/interface/stdio.py
@@ -1,15 +1,15 @@
"""
The STDIO interface for interactive CIS.
-Authors: Hamed Zamani (hazamani@microsoft.com)
+Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu)
"""
-
+import json
import time
import traceback
+from datetime import datetime
-from macaw import util
-from macaw.interface.interface import Interface
from macaw.core.interaction_handler.msg import Message
+from macaw.interface.interface import Interface
class StdioInterface(Interface):
@@ -18,44 +18,60 @@ def __init__(self, params):
self.msg_id = int(time.time())
def run(self):
- while True:
- try:
- request = input('ENTER YOUR COMMAND: ').strip()
- if len(request) == 0:
- continue
- user_info = {'first_name': 'STDIO',
- 'is_bot': 'False'
- }
- msg_info = {'msg_id': self.msg_id,
- 'msg_type': 'command' if request.startswith('#') else 'text',
- 'msg_source': 'user'}
- self.msg_id += 1
- msg = Message(user_interface='stdio',
- user_id=-1,
- user_info=user_info,
- msg_info=msg_info,
- text=request,
- timestamp=util.current_time_in_milliseconds())
- output = self.params['live_request_handler'](msg)
- self.result_presentation(output, {})
- except Exception as ex:
- traceback.print_exc()
+ print(f"Inside StdioInterface run method.")
+ with open(self.params["verbose_output_file_path"], "w+") as verbose_file:
+ while True:
+ try:
+ request = input("ENTER YOUR COMMAND (type 'exit' to leave): ").strip()
+ if request == "exit":
+ break
+ if len(request) == 0:
+ continue
+ user_info = {"first_name": "STDIO", "is_bot": "False"}
+ msg_info = {
+ "msg_id": self.msg_id,
+ "msg_type": "command" if request.startswith("#") else "text",
+ "msg_source": "user",
+ }
+ self.msg_id += 1
+ msg = Message(
+ user_interface="stdio",
+ user_id=-1,
+ text=request,
+ timestamp=datetime.utcnow(),
+ user_info=user_info,
+ msg_info=msg_info,
+ )
+ output = self.params["live_request_handler"](msg)
+ self.result_presentation(output, {"verbose_file_handler": verbose_file})
+ except Exception as ex:
+ traceback.print_exc()
+
+ def result_presentation(self, response_msg: Message, additional_params: dict):
+ verbose_fh = additional_params["verbose_file_handler"]
+ verbose_fh.write(json.dumps(dict(response_msg)) + "\n")
- def result_presentation(self, response_msg, params):
try:
- print('THE RESPONSE STARTS')
- print('----------------------------------------------------------------------')
- if response_msg.msg_info['msg_type'] == 'text':
- print(response_msg.text)
- elif response_msg.msg_info['msg_type'] == 'options':
- for (option_text, option_data, output_score) in response_msg.msg_info['options']:
- print(option_data, ' | ', option_text)
- elif response_msg.msg_info['msg_type'] == 'error':
- print('ERROR: NO RESULT!')
+ print("THE RESPONSE STARTS")
+ print(
+ "----------------------------------------------------------------------"
+ )
+ if response_msg.msg_info["msg_type"] == "text":
+ print(response_msg.response)
+ elif response_msg.msg_info["msg_type"] == "options":
+ for (option_text, option_data, output_score) in response_msg.msg_info[
+ "options"
+ ]:
+ print(option_data, " | ", option_text)
+ elif response_msg.msg_info["msg_type"] == "error":
+ print("ERROR: NO RESULT!")
else:
- raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type'])
- print('----------------------------------------------------------------------')
- print('THE RESPONSE STARTS')
+ raise Exception(
+ "The msg_type is not recognized:", response_msg.msg_info["msg_type"]
+ )
+ print(
+ "----------------------------------------------------------------------"
+ )
+ print("THE RESPONSE ENDS")
except Exception as ex:
traceback.print_exc()
-
diff --git a/macaw/interface/telegram.py b/macaw/interface/telegram.py
index b6847a8..d0951c8 100644
--- a/macaw/interface/telegram.py
+++ b/macaw/interface/telegram.py
@@ -1,18 +1,24 @@
"""
The Telegram bot (supports interactive multi-modal interactions with different devices).
-Authors: Hamed Zamani (hazamani@microsoft.com)
+Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu)
"""
-import urllib.parse
import os
import tempfile
import traceback
+import urllib.parse
+from datetime import datetime
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
-from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler
+from telegram.ext import (
+ CallbackQueryHandler,
+ CommandHandler,
+ Filters,
+ MessageHandler,
+ Updater,
+)
-from macaw import util
from macaw.core.interaction_handler.msg import Message
from macaw.interface.interface import Interface
@@ -26,21 +32,25 @@ def __init__(self, params):
params(dict): A dict of parameters. The params 'logger' and 'bot_token' are mandatory.
"""
super().__init__(params)
- self.logger = self.params['logger']
+ self.logger = self.params["logger"]
- self.MAX_MSG_LEN = 1000 # maximum number of characters in each response message.
- self.MAX_OPTION_LEN = 30 # maximum number of characters in each clickable option text.
+ self.MAX_MSG_LEN = (
+ 1000 # maximum number of characters in each response message.
+ )
+ self.MAX_OPTION_LEN = (
+ 30 # maximum number of characters in each clickable option text.
+ )
# Starting the bot by creating the Updater.
# Make sure to set use_context=True to use the new context based callbacks
# If you don't have a bot_token, add 'botfather' to your personal Telegram account and follow the instructions
# to get a token for your bot.
- self.updater = Updater(self.params['bot_token'], use_context=True)
+ self.updater = Updater(self.params["bot_token"], use_context=True)
self.dp = self.updater.dispatcher
# Telegram command handlers (e.g., /start)
- self.dp.add_handler(CommandHandler('start', self.start))
- self.dp.add_handler(CommandHandler('help', self.help))
+ self.dp.add_handler(CommandHandler("start", self.start))
+ self.dp.add_handler(CommandHandler("help", self.help))
# Telegram message handlers
self.dp.add_handler(MessageHandler(Filters.text, self.request_handler))
@@ -52,32 +62,41 @@ def __init__(self, params):
def start(self, update, context):
"""Send a message when the command /start is issued."""
- update.message.reply_text('Hi, welcome to Macaw! Macaw is an open-source extensible framework for '
- 'conversational information seeking. Visit: https://github.com/microsoft/macaw')
+ update.message.reply_text(
+ "Hi, welcome to Macaw! Macaw is an open-source extensible framework for "
+ "conversational information seeking. Visit: https://github.com/microsoft/macaw"
+ )
def help(self, update, context):
"""Send a message when the command /help is issued."""
- update.message.reply_text('Macaw should be able to answer your questions. Just ask a question!')
+ update.message.reply_text(
+ "Macaw should be able to answer your questions. Just ask a question!"
+ )
def request_handler(self, update, context):
"""This method handles all text messages, and asks result_presentation to send the response to the user."""
try:
self.logger.info(update.message)
- user_info = {'first_name': update.message.chat.first_name,
- 'last_name': update.message.chat.last_name,
- 'is_bot': update._effective_user.is_bot
+ user_info = {
+ "first_name": update.message.chat.first_name,
+ "last_name": update.message.chat.last_name,
+ "is_bot": update._effective_user.is_bot,
+ }
+ msg_info = {
+ "msg_id": update.message.message_id,
+ "msg_type": "text",
+ "msg_source": "user",
}
- msg_info = {'msg_id': update.message.message_id,
- 'msg_type': 'text',
- 'msg_source': 'user'}
- msg = Message(user_interface='telegram',
- user_id=update.message.chat.id,
- user_info=user_info,
- msg_info=msg_info,
- text=update.message.text,
- timestamp=util.current_time_in_milliseconds())
- output = self.params['live_request_handler'](msg)
- self.result_presentation(output, {'update': update})
+ msg = Message(
+ user_interface="telegram",
+ user_id=update.message.chat.id,
+ user_info=user_info,
+ msg_info=msg_info,
+ text=update.message.text,
+ timestamp=datetime.utcnow(),
+ )
+ output = self.params["live_request_handler"](msg)
+ self.result_presentation(output, {"update": update})
except Exception:
traceback.print_exc()
@@ -86,25 +105,30 @@ def voice_request_handler(self, update, context):
try:
ogg_file = tempfile.NamedTemporaryFile(delete=True)
update.message.voice.get_file().download(ogg_file.name)
- text = self.params['asr'].speech_to_text(ogg_file.name)
+ text = self.params["asr"].speech_to_text(ogg_file.name)
ogg_file.close()
- update.message.reply_text('Macaw heard: ' + text)
-
- user_info = {'first_name': update.message.chat.first_name,
- 'last_name': update.message.chat.last_name,
- 'is_bot': update._effective_user.is_bot
- }
- msg_info = {'msg_id': update.message.message_id,
- 'msg_type': 'voice',
- 'msg_source': 'user'}
- msg = Message(user_interface='telegram',
- user_id=update.message.chat.id,
- user_info=user_info,
- msg_info=msg_info,
- text=text,
- timestamp=util.current_time_in_milliseconds())
- output = self.params['live_request_handler'](msg)
- self.result_presentation(output, {'update': update})
+ update.message.reply_text("Macaw heard: " + text)
+
+ user_info = {
+ "first_name": update.message.chat.first_name,
+ "last_name": update.message.chat.last_name,
+ "is_bot": update._effective_user.is_bot,
+ }
+ msg_info = {
+ "msg_id": update.message.message_id,
+ "msg_type": "voice",
+ "msg_source": "user",
+ }
+ msg = Message(
+ user_interface="telegram",
+ user_id=update.message.chat.id,
+ user_info=user_info,
+ msg_info=msg_info,
+ text=text,
+ timestamp=datetime.utcnow(),
+ )
+ output = self.params["live_request_handler"](msg)
+ self.result_presentation(output, {"update": update})
except Exception:
traceback.print_exc()
@@ -112,53 +136,78 @@ def button_click_handler(self, update, context):
"""This method handles clicks, and asks result_presentation to send the response to the user."""
try:
self.logger.info(update)
- user_info = {'first_name': update.callback_query.message.chat.first_name,
- 'last_name': update.callback_query.message.chat.last_name,
- 'is_bot': update._effective_user.is_bot
- }
- msg_info = {'msg_id': update.callback_query.message.message_id,
- 'msg_type': 'command',
- 'msg_source': 'user'}
- msg = Message(user_interface='telegram',
- user_id=update.callback_query.message.chat.id,
- user_info=user_info,
- msg_info=msg_info,
- text=update.callback_query.data,
- timestamp=util.current_time_in_milliseconds())
- output = self.params['live_request_handler'](msg)
- self.result_presentation(output, {'update': update})
+ user_info = {
+ "first_name": update.callback_query.message.chat.first_name,
+ "last_name": update.callback_query.message.chat.last_name,
+ "is_bot": update._effective_user.is_bot,
+ }
+ msg_info = {
+ "msg_id": update.callback_query.message.message_id,
+ "msg_type": "command",
+ "msg_source": "user",
+ }
+ msg = Message(
+ user_interface="telegram",
+ user_id=update.callback_query.message.chat.id,
+ user_info=user_info,
+ msg_info=msg_info,
+ text=update.callback_query.data,
+ timestamp=datetime.utcnow(),
+ )
+ output = self.params["live_request_handler"](msg)
+ self.result_presentation(output, {"update": update})
except Exception as ex:
traceback.print_exc()
- def result_presentation(self, response_msg, params):
+ def result_presentation(self, response_msg, additional_params):
"""This method produces an appropriate response to be sent to the client."""
try:
if response_msg is None:
return
- update = params['update']
- if response_msg.msg_info['msg_type'] == 'text':
+ update = additional_params["update"]
+ if response_msg.msg_info["msg_type"] == "text":
if update.message is not None:
- update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN])
+ update.message.reply_text(response_msg.text[: self.MAX_MSG_LEN])
elif update.callback_query.message is not None:
- update.callback_query.message.reply_text(response_msg.text[:self.MAX_MSG_LEN])
- elif response_msg.msg_info['msg_type'] == 'voice':
- ogg_file_name = self.params['asg'].text_to_speech(response_msg.text[:self.MAX_MSG_LEN])
- self.updater.bot.send_voice(chat_id=update.message.chat.id, voice=open(ogg_file_name, 'rb'))
+ update.callback_query.message.reply_text(
+ response_msg.text[: self.MAX_MSG_LEN]
+ )
+ elif response_msg.msg_info["msg_type"] == "voice":
+ ogg_file_name = self.params["asg"].text_to_speech(
+ response_msg.text[: self.MAX_MSG_LEN]
+ )
+ self.updater.bot.send_voice(
+ chat_id=update.message.chat.id, voice=open(ogg_file_name, "rb")
+ )
os.remove(ogg_file_name) # removing audio files for privacy reasons.
- elif response_msg.msg_info['msg_type'] == 'options':
- keyboard = [[InlineKeyboardButton(option_text[:self.MAX_OPTION_LEN],
- callback_data=urllib.parse.unquote(option_data))]
- for (option_text, option_data, output_score) in response_msg.msg_info['options']]
+ elif response_msg.msg_info["msg_type"] == "options":
+ keyboard = [
+ [
+ InlineKeyboardButton(
+ option_text[: self.MAX_OPTION_LEN],
+ callback_data=urllib.parse.unquote(option_data),
+ )
+ ]
+ for (
+ option_text,
+ option_data,
+ output_score,
+ ) in response_msg.msg_info["options"]
+ ]
reply_markup = InlineKeyboardMarkup(keyboard)
- update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN], reply_markup=reply_markup)
- elif response_msg.msg_info['msg_type'] == 'error':
- error_msg = 'ERROR: NO RESULT!'
+ update.message.reply_text(
+ response_msg.text[: self.MAX_MSG_LEN], reply_markup=reply_markup
+ )
+ elif response_msg.msg_info["msg_type"] == "error":
+ error_msg = "ERROR: NO RESULT!"
if update.message is not None:
update.message.reply_text(error_msg)
elif update.callback_query.message is not None:
update.callback_query.message.reply_text(error_msg)
else:
- raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type'])
+ raise Exception(
+ "The msg_type is not recognized:", response_msg.msg_info["msg_type"]
+ )
except Exception:
traceback.print_exc()
@@ -173,10 +222,9 @@ def send_msg(self, chat_id, msg_text):
def run(self):
"""Starting the bot!"""
- self.logger.info('Running the Telegram bot!')
+ self.logger.info("Running the Telegram bot!")
self.updater.start_polling()
# Run the bot until you press Ctrl-C or the process receives SIGINT,
# SIGTERM or SIGABRT. This should be used most of the time, since
# start_polling() is non-blocking and will stop the bot gracefully.
self.updater.idle()
-
diff --git a/macaw/live_main.py b/macaw/live_main.py
index bf1ec30..778a863 100644
--- a/macaw/live_main.py
+++ b/macaw/live_main.py
@@ -3,12 +3,14 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
+import argparse
+from typing import List
+from core.interaction_handler import Message
+from core.response.punctuation import ResponseGeneratorPunctuation
from macaw.cis import CIS
from macaw.core import mrc, retrieval
-from macaw.core.input_handler.action_detection import RequestDispatcher
-from macaw.core.output_handler import naive_output_selection
-from macaw.util.logging import Logger
+from macaw.util.custom_logging import LoggerFactory
class ConvQA(CIS):
@@ -24,66 +26,113 @@ def __init__(self, params):
for more information on the required parameters.
"""
super().__init__(params)
- self.logger = params['logger']
- self.logger.info('Conversational QA Model... starting up...')
- self.retrieval = retrieval.get_retrieval_model(params=self.params)
- self.qa = mrc.get_mrc_model(params=self.params)
- self.params['actions'] = {'retrieval': self.retrieval, 'qa': self.qa}
- self.request_dispatcher = RequestDispatcher(self.params)
- self.output_selection = naive_output_selection.NaiveOutputProcessing({})
-
- def request_handler_func(self, conv_list):
+ self.logger.info("Conversational QA Model... starting up...")
+
+ def generate_actions(self) -> dict:
+ return {
+ "retrieval": retrieval.get_retrieval_model(params=self.params),
+ "qa": mrc.get_mrc_model(params=self.params)
+ }
+
+ def request_handler_func(self, conv_list: List[Message]) -> Message:
"""
- This function is called for each conversational interaction made by the user. In fact, this function calls the
- dispatcher to send the user request to the information seeking components.
+ This function is called for each conversational interaction made by the user. It calls the NLP pipeline, DST
+ model and all the actions and saves their result in the latest conversation message.
Args:
- conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
+ conv_list(list): List of Message, each corresponding to a conversational message from / to the
user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
Returns:
output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user.
+ It is the latest conversation message.
"""
self.logger.info(conv_list)
+
+ self.nlp_pipeline.run(conv_list)
+ self.logger.info(f"nlp pipeline result: {conv_list[0].nlp_pipeline_result}")
+
+ # Run the DST module and save the output in conversation.
+ nlp_pipeline_output = conv_list[0].nlp_pipeline_result
+ self.dialogue_manager.process_turn(nlp_pipeline_output)
+ conv_list[0].dialog_manager = self.dialogue_manager
+
dispatcher_output = self.request_dispatcher.dispatch(conv_list)
+
output_msg = self.output_selection.get_output(conv_list, dispatcher_output)
- return output_msg
+
+ # Run response generator.
+ models_to_run = self.response_generator_handler.models_selector(conv_list)
+ rg_output = self.response_generator_handler.run_models(models_to_run, conv_list)
+ self.logger.info(f"rg_output: {rg_output}")
+
+ # TODO: Select or create response from RG output.
+
+ conv_list[0].msg_info = output_msg.msg_info
+ conv_list[0].response = output_msg.response
+ conv_list[0].timestamp = output_msg.timestamp
+ conv_list[0].actions_result = rg_output
+
+ return conv_list[0]
def run(self):
"""
- This function is called to run the ConvQA system. In live mode, it never stops until the program is killed.
+ This function is called to run the ConvQA system. In live mode, it never stops until the program is killed.
"""
self.interface.run()
-if __name__ == '__main__':
- basic_params = {'timeout': 15, # timeout is in terms of second.
- 'mode': 'live', # mode can be either live or exp.
- 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class.
+if __name__ == "__main__":
+ # Parse input arguments.
+ parser = argparse.ArgumentParser(description="Run live_main.py file")
+ parser.add_argument("--mode", type=str, default="live", help="live or exp (experimental)")
+ parser.add_argument("--interface", type=str, default="stdio", help="can be 'telegram' or 'stdio' for live mode, and"
+ " 'fileio' for exp mode")
+ args = parser.parse_args()
+
+ basic_params = {
+        "timeout": 15,  # timeout is in seconds.
+ "mode": args.mode, # mode can be either live or exp.
+ }
+
+ # for logging into file, pass the filepath to the Logger class.
+ my_logger = LoggerFactory.create_logger({})
# These are required database parameters if the mode is 'live'. The host and port of the machine hosting the
# database, as well as the database name.
- db_params = {'interaction_db_host': 'localhost',
- 'interaction_db_port': 27017,
- 'interaction_db_name': 'macaw_test'}
+ db_params = {
+ "interaction_db_host": "localhost",
+ "interaction_db_port": 27017,
+ "interaction_db_name": "macaw_test",
+ }
# These are interface parameters. They are interface specific.
- interface_params = {'interface': 'telegram', # interface can be 'telegram' or 'stdio' for live mode, and 'fileio'
- # for exp mode.
- 'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN', # Telegram bot token.
- 'asr_model': 'google', # The API used for speech recognition.
- 'asg_model': 'google', # The API used for speech generation.
- 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE'}
+ interface_params = {
+        "interface": args.interface,  # can be 'telegram' or 'stdio' for live mode, and 'fileio' for exp mode.
+
+        # File paths used by the 'fileio' interface in experimental mode.
+ "input_file_path": "/usr/src/app/data/file_input.txt",
+ "output_file_path": "/usr/src/app/data/file_output.txt",
+
+ "verbose_output_file_path": "/usr/src/app/data/file_output_verbose.txt",
+ "output_format": "text",
+ "bot_token": "YOUR_TELEGRAM_BOT_TOKEN", # Telegram bot token.
+ # 'asr_model': 'google', # The API used for speech recognition.
+ # 'asg_model': 'google', # The API used for speech generation.
+ "google-speech-to-text-credential-file": "YOUR_GOOGLE_CREDENTIAL_FILE",
+ }
# These are parameters used by the retrieval model.
- retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history.
- 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False.
- 'search_engine': 'bing', # the search engine. It can be either 'indri' or 'bing'.
- 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key
- 'search_engine_path': 'PATH_TO_INDRI', # The path to the indri toolkit.
- 'col_index': 'PATH_TO_INDRI_INDEX', # The path to the indri index.
- 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported.
- 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine.
+ retrieval_params = {
+ "query_generation": "simple", # the model that generates a query from a conversation history.
+ "use_coref": True, # True, if query generator can use coreference resolution, otherwise False.
+ "search_engine": "tantivy", # the search engine.
+ "bing_key": "YOUR_BING_SUBSCRIPTION_KEY", # Bing API key
+ "search_engine_path": "tantivy_index/", # The path to the tantivy index.
+ "col_index": "/usr/src/app/indri-5.11/buildindex/my_index", # The path to the indri index.
+ "col_text_format": "trectext", # collection text format. Standard 'trectext' is only supported.
+        "results_requested": 3,  # Maximum number of docs that should be retrieved by search engine.
+    }
# Note: If you want to have a re-ranking model (e.g., learning to rank), you just need to simply extend the class
# core.retrieval.search_engine.ReRanker and implement the method 'rerank'. Then simply add a 'reranker' parameter to
# retrieval_params that points to an instance of your favorite ReRanker class. If there is a 'reranker' parameter in
@@ -91,13 +140,34 @@ def run(self):
# 'get_results' in class core.retrieval.search_engine.Retrieval.
# These are parameters used by the machine reading comprehension model.
- mrc_params = {'mrc': 'drqa', # MRC model.
- 'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL', # The path to the model parameters.
- 'mrc_path': 'PATH_TO_MRC_DIRECTORY', # The path to the model toolkit.
- 'corenlp_path': 'PATH_TO_STANFORD_CORE_NLP_DIRECTORY', # The path to the corenlp toolkit.
- 'qa_results_requested': 3} # The number of candidate answers returned by the MRC model.
-
- params = {**basic_params, **db_params, **interface_params, **retrieval_params, **mrc_params}
- basic_params['logger'].info(params)
+ mrc_params = {
+ "mrc": "drqa", # MRC model.
+ "mrc_model_path": "/usr/src/app/DrQA/data/reader/multitask.mdl", # The path to the model parameters.
+ "mrc_path": "/usr/src/app/DrQA", # The path to the model toolkit.
+ "corenlp_path": "/usr/src/app/stanford-corenlp-full-2017-06-09", # The path to the corenlp toolkit.
+        "qa_results_requested": 3,  # The number of candidate answers returned by the MRC model.
+    }
+
+ # ML modules which are part of the NLP pipeline and all response generators.
+ ml_modules = {
+ "nlp_models": {
+ "intent_classification": "http://nlp-pipeline-app-ic:80",
+ "sample_flask": "http://nlp-pipeline-app-flask:80",
+ },
+ "response_generator_models": {
+ "qa": "http://response-generator-app-qa:80",
+ "punctuation": ResponseGeneratorPunctuation(name="punctuation_model")
+ }
+ }
+
+ params = {
+ **basic_params,
+ **db_params,
+ **interface_params,
+ **retrieval_params,
+ **mrc_params,
+ **ml_modules
+ }
+
+ my_logger.info(params)
ConvQA(params).run()
-
diff --git a/macaw/util/__init__.py b/macaw/util/__init__.py
index 5e6e731..07aaaf4 100644
--- a/macaw/util/__init__.py
+++ b/macaw/util/__init__.py
@@ -5,22 +5,15 @@
"""
import json
-import time
+import backoff as backoff
from stanfordcorenlp import StanfordCoreNLP
-def current_time_in_milliseconds():
- """
- A method that returns the current time in milliseconds.
-
- Returns:
- An int representing the current time in milliseconds.
- """
- return int(round(time.time() * 1000))
-
-
class NLPUtil:
+ @backoff.on_exception(backoff.expo,
+ Exception,
+ max_tries=3)
def __init__(self, params):
"""
A simple NLP helper class.
@@ -29,11 +22,15 @@ def __init__(self, params):
params(dict): A dict containing some parameters.
"""
self.params = params
- self.corenlp = StanfordCoreNLP(self.params['corenlp_path'], quiet=False)
+ self.corenlp = StanfordCoreNLP(self.params["corenlp_path"], quiet=False)
# Pre-fetching the required models.
- props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False}
- self.corenlp.annotate('', properties=props)
+ props = {
+ "annotators": "coref",
+ "pipelineLanguage": "en",
+ "ner.useSUTime": False,
+ }
+ self.corenlp.annotate("", properties=props)
def get_coref(self, text):
"""
@@ -44,7 +41,11 @@ def get_coref(self, text):
Returns:
A json object containing all co-reference resolutions extracted from the input text.
"""
- props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False}
+ props = {
+ "annotators": "coref",
+ "pipelineLanguage": "en",
+ "ner.useSUTime": False,
+ }
result = json.loads(self.corenlp.annotate(text, properties=props))
return result
diff --git a/macaw/util/custom_logging.py b/macaw/util/custom_logging.py
new file mode 100644
index 0000000..7ac0da7
--- /dev/null
+++ b/macaw/util/custom_logging.py
@@ -0,0 +1,28 @@
+"""
+The internal logger.
+
+Authors: Hamed Zamani (hazamani@microsoft.com)
+"""
+
+import logging
+
+
+class LoggerFactory:
+ @staticmethod
+ def create_logger(params) -> logging.Logger:
+ """
+ Creates a new custom logger with configuration passed in params dictionary. Supported options are
+ 'logger_name', 'logger_level', and 'logger_file'.
+ """
+ logger = logging.getLogger(params.get("logger_name", "MacawLogger"))
+ logger.setLevel(params.get("logger_level", logging.DEBUG))
+
+ if "logger_file" in params:
+ handler = logging.FileHandler(params["logger_file"])
+ else:
+ handler = logging.StreamHandler()
+
+ formatter = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s')
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+ return logger
diff --git a/macaw/util/logging.py b/macaw/util/logging.py
deleted file mode 100644
index 156ae77..0000000
--- a/macaw/util/logging.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-The internal logger.
-
-Authors: Hamed Zamani (hazamani@microsoft.com)
-"""
-
-import logging
-
-
-class Logger(logging.Logger):
- def __init__(self, params):
- """
- A simple logging class, inherited from the standard logging.Logger.
-
- Args:
- params(dict): A dict containing some parameters. 'logging_file' is an optional parameter, otherwise STDIO
- will be used for logging.
- """
- super().__init__('Macaw Logger')
- self.params = params
- if 'logging_file' in params:
- self.handler_ = logging.FileHandler(params['logging_file'])
- else:
- self.handler_ = logging.StreamHandler()
-
- self.format = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s')
- self.handler_.setFormatter(self.format)
- self.addHandler(self.handler_)
diff --git a/macaw/util/text_parser.py b/macaw/util/text_parser.py
index 8493bc3..5ae3205 100644
--- a/macaw/util/text_parser.py
+++ b/macaw/util/text_parser.py
@@ -4,9 +4,10 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-import justext
from xml.etree import cElementTree as ElementTree
+import justext
+
class XmlListConfig(list):
def __init__(self, aList):
@@ -88,6 +89,7 @@ def xml_file_to_dict(xml_file):
# result = filter(visible, data)
# return ' '.join(result)
+
def html_to_clean_text(html):
"""
Converting an HTML document to clean text.
@@ -102,4 +104,4 @@ def html_to_clean_text(html):
for paragraph in paragraphs:
if not paragraph.is_boilerplate:
clean_text_list.append(paragraph.text)
- return '\n'.join(clean_text_list)
+ return "\n".join(clean_text_list)
diff --git a/macaw/wizard_of_oz_main.py b/macaw/wizard_of_oz_main.py
index 08331d5..7324bfb 100644
--- a/macaw/wizard_of_oz_main.py
+++ b/macaw/wizard_of_oz_main.py
@@ -3,15 +3,15 @@
Authors: Hamed Zamani (hazamani@microsoft.com)
"""
-
+import logging
import multiprocessing
from macaw import interface
from macaw.core import retrieval
-from macaw.core.input_handler.action_detection import RequestDispatcher
+from macaw.core.response.action_detection import RequestDispatcher
from macaw.core.interaction_handler.user_requests_db import InteractionDB
from macaw.core.output_handler import naive_output_selection
-from macaw.util.logging import Logger
+from macaw.util.custom_logging import LoggerFactory
class Seeker:
@@ -27,7 +27,7 @@ def __init__(self, params):
for more information on the required parameters.
"""
self.params = params
- self.logger = params['logger']
+ self.logger = logging.getLogger("MacawLogger")
self.logger.info('Conversational Wirzard of Oz System... starting up...')
self.wizard = None
self.params['live_request_handler'] = self.live_request_handler
@@ -87,7 +87,7 @@ def __init__(self, params):
for more information on the required parameters.
"""
self.params = params
- self.logger = params['logger']
+ self.logger = logging.getLogger("MacawLogger")
self.logger.info('Conversational Wirzard of Oz System... starting up...')
self.params['live_request_handler'] = self.live_request_handler
self.seeker = None
@@ -148,9 +148,9 @@ def run(self):
if __name__ == '__main__':
+ my_logger = LoggerFactory.create_logger({})
basic_params = {'timeout': 15, # timeout is in terms of second.
- 'mode': 'live', # mode can be either live or exp.
- 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class.
+ 'mode': 'live'} # mode can be either live or exp.
# These are required database parameters if the mode is 'live'. The host and port of the machine hosting the
# database, as well as the database name.
@@ -190,8 +190,8 @@ def run(self):
seeker_params = {**basic_params, **db_params, **seeker_interface_params, **retrieval_params}
wizard_params = {**basic_params, **db_params, **wizard_interface_params, **retrieval_params}
- basic_params['logger'].info(seeker_params)
- basic_params['logger'].info(wizard_params)
+ my_logger.info(seeker_params)
+ my_logger.info(wizard_params)
seeker = Seeker(seeker_params)
wizard = Wizard(wizard_params)
@@ -204,6 +204,6 @@ def run(self):
seeker_process.start()
wizard_process.start()
- basic_params['logger'].info('Seeker Process ID: {}'.format(seeker_process.pid))
- basic_params['logger'].info('Wizard Process ID: {}'.format(wizard_process.pid))
+ my_logger.info('Seeker Process ID: {}'.format(seeker_process.pid))
+ my_logger.info('Wizard Process ID: {}'.format(wizard_process.pid))
diff --git a/requirements.txt b/requirements.txt
index 4a068f5..856df75 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,7 @@ pydub
python-telegram-bot==12.0.0
stanfordcorenlp
google-cloud-texttospeech
+tantivy
+pymongo
+backoff
+deepmultilingualpunctuation
diff --git a/schema.ts b/schema.ts
new file mode 100644
index 0000000..0e1c16f
--- /dev/null
+++ b/schema.ts
@@ -0,0 +1,21 @@
+interface Message {
+ user_interface: string,
+ user_id: string | number, // number as in integer
+ text: string,
+ timestamp: Date, // datetime.utcnow()
+ user_info?: {[user_detail: string]: any},
+ msg_info?: {[msg_detail: string]: any},
+ actions: {
+ [action: string]: string | {[action_var: string]: any}
+ },
+ dialog_state_tracking: {
+ state: string,
+ [dialog_state_tracking_var: string]: string
+ },
+ nlp_pipeline: {
+ [module: string]: {[module_var: string]: any}
+ }
+ user_attributes: {
+ [attribute: string]: any
+ }
+}
\ No newline at end of file
diff --git a/scripts/start.sh b/scripts/start.sh
new file mode 100644
index 0000000..a9f61dd
--- /dev/null
+++ b/scripts/start.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+echo "Inside the execution script."
+
+# Start MongoDB as a daemon process https://docs.mongodb.com/manual/tutorial/manage-mongodb-processes/
+mongod --fork --logpath /var/log/mongodb/mongod.log
+echo "MongoDB started successfully."
+
+echo "This is the python version: $(python3 --version)"
+
+# Start the main application.
+python3 macaw/live_main.py --mode exp --interface fileio
+# Use the below command for terminal IO.
+#python3 macaw/live_main.py --mode live --interface stdio
diff --git a/setup.py b/setup.py
index 8675bdc..c9a82ac 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
setup(
name='macaw',
version='0.1',
- packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.input_handler',
+ packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.response',
'macaw.core.output_handler', 'macaw.core.interaction_handler', 'macaw.util', 'macaw.interface'],
url='https://github.com/microsoft/macaw/',
license='MIT',
diff --git a/trec_documents/.DS_Store b/trec_documents/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/trec_documents/.DS_Store differ
diff --git a/trec_documents/docset.trectext b/trec_documents/docset.trectext
new file mode 100644
index 0000000..67ba521
--- /dev/null
+++ b/trec_documents/docset.trectext
@@ -0,0 +1,94 @@
+
+ XIE19960101.0001
+ 1996-01-01 20:06
+
+ Yearender: Jordanian Diplomacy Witnesses Great Achievements in 1995 (part two)
+
+The normalization with Egypt, an active supporter of the
+anti-Iraq coalition during the Gulf crisis, was symbolized by the
+visit of Egyptian President Hosni Mubarak to Jordan last January.
+At the same time, Jordan improved its ties with the
+Palestinians by dispelling their fears that arouse among the
+Palestinians due to Jordan's special role in Jerusalem under the
+Jordanian-Israeli accord.
+In addition, Jordan also made efforts to push forward the
+Palestinian-Israeli negotiations on expanding Palestinian autonomy
+in the West Bank.
+To its own benefits, Jordan's peace treaty with Israel and new
+attitude toward Baghdad have also brought about warmer ties with
+the Western countries, with the United States in particular.
+Among others, U.S. Vice President Al Gore, British Prime
+Minister John Major, German Chancellor Helmut Kohl, Japanese Prime
+Minister Tomiichi Murayama as well as Spanish Prime Minister
+Felipe Gonzalez, visited Amman and reiterated their countries'
+support for the kingdom in various fields.
+U.S. Secretary of State Warren Christopher came to Jordan time
+and again to meet with King Hussein during his more than a dozen
+of shuttle visits to the region to expedite the ongoing Middle
+East peace process.
+The Clinton Administration also voiced its full support, without
+reservation, to the kingdom to defend its security immediately
+after Iraqi former Industry Minister Kamel Hassan defected to
+Jordan in August, warning against a threat of Iraqi retaliation
+upon Jordan.
+In the past year, King Hussein, Crown Prince Hassan and Prime
+Minister Sharif Zeid Ben Shaker as well as other senior Jordanian
+officials traveled to London, Paris, Tokyo, Bonn and Washington,
+bringing home agreements for economic aid or military assistance
+as well as promises of support.
+Its success of hosting the second Middle East and North Africa
+economic summit in Amman, which drew the participation of 1,600
+government officials and private businessmen from 62 countries
+across the world, added great credit to the kingdom and was considered
+the crowning accomplishment of Jordan's foreign policy.
+
+
+
+
+ XIE19960101.0002
+ 1996-01-01 20:58
+
+ Yearender: Jordanian Diplomacy Witnesses Great Achievements in 1995 (by Wen Xinnian) (part one)
+
+AMMAN, January 1 (Xinhua) -- Jordan, in the past year,
+witnessed a series of remarkable achievements on its diplomatic
+front.
+In the year, the kingdom restored its territorial and water
+rights from Israel, normalized relations with the Jewish state,
+improved ties with other Arab states and moved closer to the
+Western countries, especially the United States.
+The historic Israel-Jordan peace treaty signed in October,
+1994, which ended 46 years of enmity between them, heralded a new
+era for Jordan's foreign policy and was consequently lauded as a
+model of peace in the region.
+In 1995, Jordan regained sovereignty over its territories
+occupied by Israel and exchanged ambassadors with Tel Aviv. It
+also signed a number of agreements with Israel, expanding peace to
+economic cooperation.
+Its grant of political asylum to General Hussein Kamel Hassan,
+son-in-law of President Saddam Hussein, and its growing
+estrangement from Iraq, have quickened the pace to improve its
+relations with the Gulf Arab states, which were strained because
+of Jordan's tilt toward Iraq during the 1990-1991 Gulf crisis.
+In the past year, the kingdom also saw an outstanding warm-up
+of ties with Saudi Arabia, the strongest of the six-nation Gulf
+Cooperation Council (GCC).
+Jordan's Foreign Minister Abdul-Karim Kabariti visited Riyadh
+twice during the year and was received by Saudi King Fahd on his
+second visit to the oil-rich Gulf kingdom.
+Saudi Ambassador to Jordan Abdullah Sudairi arrived in Amman
+last month and took his post five years after Saudi Arabia
+recalled its ambassador in Amman, marking the return to normal of
+the bilateral relations.
+A visit to Saudi Arabia by King Hussein and his summit meeting
+with King Fahd, which had been scheduled for early this month but
+were canceled due to Fahd's sudden illness, are expected to
+materialize soon.
+Kuwait, invaded by Iraq in 1990, has not formally announced its
+normalization with Jordan, but has been moving closer to it.
+The two countries' foreign ministers met repeatedly on the
+sidelines of many regional and international gatherings and
+agreed to conducting regular coordination and consultation.
+
+
+