diff --git a/.gitignore b/.gitignore index 894a44c..24e83e0 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,11 @@ venv.bak/ # mypy .mypy_cache/ + +# IDE folders +.idea/ +.vscode/ + +# Dependencies +DrQA/ +stanford-corenlp-full-* diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 0e40fe8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ - -# Default ignored files -/workspace.xml \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/macaw.iml b/.idea/macaw.iml deleted file mode 100644 index 055624e..0000000 --- a/.idea/macaw.iml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index d1151ac..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 7021aec..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b227825 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,78 @@ +FROM ubuntu:20.04 + +WORKDIR /usr/src/app + +# Disable interactive input. This is needed to install mongodb. +ENV DEBIAN_FRONTEND noninteractive + +RUN apt-get update && apt-get install -y \ + gnupg \ + wget \ + g++ \ + make \ + zlib1g-dev \ + software-properties-common \ + unzip \ + default-jre \ + git \ + apt-transport-https ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y python3-pip + +RUN apt update && apt install -y ffmpeg + +RUN pip3 install --upgrade pip && pip3 install torch + +# Install all dependencies mentioned in the macaw requirements document. +COPY requirements.txt requirements.txt +RUN pip3 install -r requirements.txt + +# Download Stanford core NLP data if user has not specified a local volume. This is a 400MB compressed file. +ARG download_stanford_corenlp=false +RUN if $download_stanford_corenlp ; then \ + wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip" \ + && unzip "stanford-corenlp-full-2017-06-09.zip" \ + && rm "stanford-corenlp-full-2017-06-09.zip" ; fi + +# Download and install FAIR DrQA module. +RUN git clone https://github.com/facebookresearch/DrQA.git +RUN cd DrQA \ + && pip3 install -r requirements.txt \ + && python3 setup.py develop + +# Download a pre-trained DrQA model if user has not specified a local volume. This is a 7.5GB compressed file download +# and requires 25GB of uncompressed space. To save Docker image memory, it is recommended to use external file as volume. +ARG download_drqa_model=false +RUN if $download_drqa_model ; then cd DrQA && ./download.sh ; fi + +# Install MongoDB server https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ +RUN wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add - +RUN echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/5.0 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-5.0.list +RUN apt-get update && apt-get install -y mongodb-org + +# Create the MongoDB data directory. +RUN mkdir -p /data/db + +# Copy directory that contains trectext documents needed for Indri retriever. +COPY trec_documents trec_documents + +# Copy files and directories from workspace to Docker container. +COPY macaw macaw +COPY scripts scripts +COPY setup.py setup.py + +ENV PYTHONPATH="$PYTHONPATH:/usr/src/app" + +# Install Macaw. +RUN python3 setup.py install + +# To fix async keyword issue in python3.7+ https://github.com/pexpect/pexpect/issues/453 +RUN pip3 install -Iv pexpect==4.8.0 + +# Create index +RUN mkdir tantivy_index/ +RUN python3 macaw/build_tantivy_index.py --index_path tantivy_index/ --document_path trec_documents/ + +# Run the script that will start MongoDB and run python application. +CMD ["/bin/bash", "scripts/start.sh"] diff --git a/README.md b/README.md index c6e6d06..add050d 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,33 @@ # Macaw: An Extensible Conversational Information Seeking Platform + Conversational information seeking (CIS) has been recognized as a major emerging research area in information retrieval. -Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is -an open-source framework with a modular architecture for CIS research. Macaw supports *multi-turn*, *multi-modal*, and -*mixed-initiative* interactions, for tasks such as document retrieval, question answering, recommendation, and -structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be -evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in -an interactive mode, where the back end can be *fully algorithmic* or a *wizard of oz* setup. - -Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language +Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is an +open-source framework with a modular architecture for CIS research. Macaw supports _multi-turn_, _multi-modal_, and +_mixed-initiative_ interactions, for tasks such as document retrieval, question answering, recommendation, and +structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be +evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in +an interactive mode, where the back end can be _fully algorithmic_ or a _wizard of oz_ setup. + +Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language processing, and dialogue systems. For more information on Macaw, please refer to [this paper](https://arxiv.org/pdf/1912.08904.pdf). Table of content: -+ [Macaw Architecture](#macaw-architecture) - + [Interfaces](#interfaces) - + [Retrieval](#retrieval) - + [Answer Selection and Generation](#answer-selection-and-generation) -+ [Installation](#installation) -+ [Running Macaw](#running-macaw) -+ [Bug Report and Feature Request](#bug-report-and-feature-request) -+ [Citation](#citation) -+ [License](#license) -+ [Contribution](#contribution) + +- [Macaw Architecture](#macaw-architecture) + - [Interfaces](#interfaces) + - [Retrieval](#retrieval) + - [Answer Selection and Generation](#answer-selection-and-generation) +- [Installation](#installation) +- [Running Macaw](#running-macaw) +- [Bug Report and Feature Request](#bug-report-and-feature-request) +- [Citation](#citation) +- [License](#license) +- [Contribution](#contribution) ## Macaw Architecture + Macaw has a modular architecture, which allows further development and extension. The high-level architecture of Macaw is presented below: @@ -33,88 +36,212 @@ is presented below: For more information on each module in Macaw, refer to this paper. #### Interfaces + Macaw supports the following interfaces: -+ Standard IO: For *development* purposes -+ File IO: For *batch experiments* (see the examples in the `data` folder for input and output file formats) -+ Telegram bot: For interaction with real users + +- Standard IO: For _development_ purposes +- File IO: For _batch experiments_ (see the examples in the `data` folder for input and output file formats) +- Telegram bot: For interaction with real users Here is an example of the Telegram interface for Macaw. It supports multi-modal interactions (text, speech, click, etc). -![Telegram interface for Macaw](macaw-example-tax.jpg) +![Telegram interface for Macaw](macaw-example-tax.jpg) ![Telegram interface for Macaw](macaw-example-shakespeare.jpg) - #### Retrieval + Macaw features the following search engines: -+ [Indri](http://lemurproject.org/indri.php): an open-source search engine that can be used for any arbitrary text -collection. -+ Bing web search API: sending a request to the Bing API and getting the results. + +- [Tantivy](https://github.com/quickwit-oss/tantivy): Tantivy is a full-text search engine library written in Rust. +- Bing web search API: sending a request to the Bing API and getting the results. #### Answer Selection and Generation -For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current + +For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current version. +## Installation and running with Docker -## Installation -Macaw requires `Python >= 3.6` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. -To install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The -mentioned installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux -distribution. If you are using Windows 10, we recommend installing Macaw and all the required packages on -[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10). +The package has been tested with certain dependencies and it is much easier to reproduce it in a similar environment. It +has been integrated with Docker to make it compatible with all operating systems. The default Docker setup runs the +application using the Standard IO interface, uses Tantivy for document retrieval, and DrQA for MRC (answer selection). To +run using other settings, appropriate changes should be done. -#### Step 1: Installing MongoDB server -Macaw uses MongoDB for storing and retrieving user interactions (conversations). To install MongoDB server, run the -following command: +The first step is to install [Docker](https://docs.docker.com/engine/install/) in your system. Then continue with the +below steps. + +### Create the build + +To reduce the size of the build, we can keep certain data outside the Docker container and mount it +using [volumes](https://docs.docker.com/storage/volumes/). + +1. Download the Stanford Core NLP data + from [here](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and put the + directory `stanford-corenlp-full-2017-06-09` in your project root directory. +1. Install [DrQA](https://github.com/facebookresearch/DrQA) in a separate workspace and download the pre-trained model. + It stores the models in `data/reader/` directory. We will use the downloaded _multitask.mdl_ model. + +Once you have the two downloads done, run the below command from project root to create a docker build with name _macaw_: + +```commandline +docker build -t macaw . ``` -sudo apt-get install mongodb-server-core + +If you don't want to pre-install DrQA model and Stanford Core NLP data, create the build using the below command. It +will install both dependencies for you and keep them as part of the build. Note that this will significantly increase +the build size (by ~400MB for Stanford CoreNLP and by ~7.5GB for DrQA). + +```commandline +docker build --build-arg download_stanford_corenlp=true --build-arg download_drqa_model=true -t macaw . +``` + +_Note: To make sure that the Docker container builds without modification, an x86_64/amd64 system is required. If you have an arm64 device, then add the flag `--platform linux/amd64` to the build command._ + +### Run the application + +If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need +to provide the local directory path during runtime. Run the command from project root. + +```commandline +docker run --rm -i --name=macaw_test_container \ +-v :/usr/src/app/DrQA/data \ +-v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \ +macaw +``` + +`` could be `/Users/amitgh/PycharmProjects/DrQA/data` if you downloaded the pre-trained model in a +separate workspace named DrQA. + +If you did not separately download data at build time, simply run: + +```commandline +docker run --rm -i --name=macaw_test_container macaw +``` + +In above command we start a container with name _macaw_test_container_ from build image _macaw_ in interactive +mode (`-i`) +and remove the container when the application exits (`--rm`). After installing all dependencies, it +runs `scripts/start.sh` +which first starts MongoDB server in a separate thread and then runs `live_main.py`. + +_Note: Similar to requiring the additional flag of `--platform linux/amd64` to build the Docker container with an arm64 machine, running said container also requires the same flag. +:warning: **The performance of the container under this emulation will be incredibly poor. If possible, use a x86_64/amd64 system**._ + +#### Run with file input + +To avoid typing the input every time, you can provide an input file and get output in an output file. We need to mount +the directory containing the data. + +```commandline +docker build -t macaw . && docker run --rm -i --name=macaw_test_container \ +-v :/usr/src/app/DrQA/data \ +-v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \ +-v $("pwd")/data:/usr/src/app/data \ +macaw +``` + +Also update the command inside `scripts/start.sh` file to +```commandline +python3 macaw/live_main.py --mode exp --interface fileio +``` + +### ssh into the container + +While the application is running, we can go inside the container to see the contents (directory structure, Tantivy index, +etc.). + +```commandline +docker exec -it macaw_test_container /bin/bash +``` + +### Updating TREC data for Tantivy + +Tantivy index is created using the document stored in `trec_documents/` directory. It has some default data. To create a +bigger index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents. +Docker will copy it during build time and create a new index. + +## Running entire Macuna application + +Using `docker compose` we can start the main application and all other supporting docker containers (nlp pipeline +applications and remote modules) at once. This does not work with stdio mode as docker compose does not support +terminal input. Run the below command. + +```commandline +docker compose build && docker compose up ``` -#### Step 2: Installing Indri and Pyndri -[Indri](http://lemurproject.org/indri.php) is an open-source search engine for information retrieval research, -implemented as part of the [Lemur Project](http://lemurproject.org/). -[Pyndri](https://github.com/cvangysel/pyndri) is a python interface to Indri. Macaw uses Indri for retrieving documents -from an arbitrary text collection. -To install Indri, first download Indri from https://sourceforge.net/projects/lemur/files/lemur/. As suggested by pyndri, -we have used Indri-5.11. This Indri version can be installed as follows: +To run different containers independently or to support terminal input, run the below commands in order. + +First, build the application. + +```commandline +docker compose build ``` -# download indri-5.11.tar.gz -sudo apt install g++ zlib1g-dev -tar xzvf indri-5.11.tar.gz -rm indri-5.11.tar.gz -cd indri-5.11 -./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0" -make -sudo make install + +Second, start all the supporting remote modules. Make sure to explicitly provide port and container names. This can be +found from the `docker-compose.yml` file. + +```commandline +docker compose run --rm -p "127.0.0.1:8001:80" --name nlp-pipeline-app-flask nlp-pipeline-app-flask +docker compose run --rm -p "127.0.0.1:8002:80" --name nlp-pipeline-app-ic nlp-pipeline-app-ic +docker compose run --rm -p "127.0.0.1:8003:80" --name response-generator-app-qa response-generator-app-qa +``` + +Third, run the main application which has stdio (or it can also have fileio). For stdio update the flags in `start.sh` +with `--mode live --interface stdio`. + +```commandline +docker compose run --rm base-app ``` -Then, clone the pyndri repository from https://github.com/cvangysel/pyndri and run the following command: +## Local Setup + +To setup the package locally without using Docker, follow the below instructions. + +### Installation + +Macaw requires `Python >= 3.5` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. To +install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The mentioned +installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux distribution. +If you are using Windows 10, we recommend installing Macaw and all the required packages on +[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10). + +#### Step 1: Installing MongoDB server + +Macaw uses MongoDB for storing and retrieving user interactions (conversations). To install MongoDB server, run the +following command: + ``` -python3 setup.py install +sudo apt-get install mongodb-server-core ``` At this step, you can make sure your installation is complete by running the pyndri tests. -#### Step 3: Installing Stanford Core NLP -Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need -co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. Get it by following these +#### Step 2: Installing Stanford Core NLP + +Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need +co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. Get it by following these commands: + ``` wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip" sudo apt-get install unzip unzip "stanford-corenlp-full-2017-06-09.zip" rm "stanford-corenlp-full-2017-06-09.zip" -``` +``` If you don't have `java`, install it using: + ``` sudo apt-get install default-jre ``` -#### Step 4: Installing DrQA -Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it +#### Step 3: Installing DrQA + +Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it features [DrQA](https://github.com/facebookresearch/DrQA). If you do not need this functionality, ignore this step (you -can also install this later). -To install DrQA, run the following commands: +can also install this later). To install DrQA, run the following commands: + ``` git clone https://github.com/facebookresearch/DrQA.git cd DrQA @@ -123,47 +250,54 @@ pip3 install torch sudo python3 setup.py develop ``` -To use pre-trained DrQA model, use the following command. +To use pre-trained DrQA model, use the following command. + ``` ./download.sh ``` + This downloads a 7.5GB (compressed) file and requires 25GB (uncompressed) space. This may take a while! - +#### Step 4: Installing FFmpeg + +To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't need +a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command: -#### Step 5: Installing FFmpeg -To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't -need a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command: ``` -sudo apt-get install +sudo apt-get install ``` -#### Step 6: Installing Macaw +#### Step 5: Installing Macaw + After cloning Macaw, use the following commands for installation: + ``` cd macaw sudo pip3 install -r requirements.txt sudo python3 setup.py install ``` -## Running Macaw +### Running Macaw + If you run macaw with interactive (or live) mode, you should first run MongoDB server using the following command: + ``` sudo mongod ``` -Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create -this directory if you haven't. You can also use other locations using the `--dbpath` argument. +Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create +this directory if you haven't. You can also use other locations using the `--dbpath` argument. We provide three different main scripts (i.e., app): -+ `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram -interfaces. -+ `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the -interface. -+ `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments. - + +- `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram + interfaces. +- `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the + interface. +- `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments. + After selecting the desired main script, open the python file and provide the required parameters. For example, you need -to use your Bing subscription key (if using Bing), the path to Indri index (if using Indri), Telegram bot token (if +to use your Bing subscription key (if using Bing), the path to Tantivy index, Telegram bot token (if using Telegram interface), etc. in order to run the `live_main.py` script. You can further run the favorite main script as below: @@ -171,18 +305,21 @@ as below: python3 live_main.py ``` - ## Bug Report and Feature Request -For bug report and feature request, you can open an issue in github, or send an email to + +For bug report and feature request, you can open an issue in github, or send an email to [Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`. ## Citation + If you found Macaw useful, you can cite the following article: + ``` Hamed Zamani and Nick Craswell, "Macaw: An Extensible Conversational Information Seeking System", arxiv pre-print. ``` bibtex: + ``` @article{macaw, title={Macaw: An Extensible Conversational Information Seeking Platform}, @@ -193,18 +330,18 @@ bibtex: ``` ## License -Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information. +Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information. ## Contribution -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License +Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For +details, visit https://cla.opensource.microsoft.com. -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate +the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only +need to do this once across all repos using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or diff --git a/data/file_input.txt b/data/file_input.txt new file mode 100644 index 0000000..5a3ab77 --- /dev/null +++ b/data/file_input.txt @@ -0,0 +1,4 @@ +q1 who is the president of Japan? +q2 when did the gulf war start? +q3 what rights did the kingdom restore with israel? +q4 when did jordan regained sovereignty over its territories? \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..db68cec --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,34 @@ +version: '3.4' +services: + base-app: + build: ./ + ports: + - "127.0.0.1:8000:80" + volumes: + - "/Users/amitgh/PycharmProjects/DrQA/data:/usr/src/app/DrQA/data" + - "/Users/amitgh/PycharmProjects/Maruna/macaw/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09" + - "/Users/amitgh/PycharmProjects/Maruna/macaw/data:/usr/src/app/data" + deploy: + resources: + limits: + memory: 5gb + nlp-pipeline-app-flask: + build: ./macaw/docker/flask_app + ports: + - "127.0.0.1:8001:80" + deploy: + resources: + limits: + memory: 100mb + nlp-pipeline-app-ic: + build: ./macaw/docker/ic_app + ports: + - "127.0.0.1:8002:80" + response-generator-app-qa: + build: ./macaw/docker/qa_app + ports: + - "127.0.0.1:8003:80" + deploy: + resources: + limits: + memory: 100mb diff --git a/macaw/batch_exp_main.py b/macaw/batch_exp_main.py index 9e61b96..8cad6b3 100644 --- a/macaw/batch_exp_main.py +++ b/macaw/batch_exp_main.py @@ -6,7 +6,7 @@ from macaw.cis import CIS from macaw.core import mrc, retrieval -from macaw.core.input_handler.action_detection import RequestDispatcher +from macaw.core.response.action_detection import RequestDispatcher from macaw.core.output_handler import naive_output_selection diff --git a/macaw/build_tantivy_index.py b/macaw/build_tantivy_index.py new file mode 100644 index 0000000..e584c7f --- /dev/null +++ b/macaw/build_tantivy_index.py @@ -0,0 +1,55 @@ +import argparse +import os +from typing import List + +import tantivy + +from macaw.core.retrieval.doc import get_trec_doc + + +def get_trec_docs(documents_path: str) -> List[str]: + # can be optimized + doc_list = [] + raw_body_list = [] + for trec_files in os.listdir(documents_path): + if not trec_files.startswith('.'): + with open(os.path.join(documents_path, trec_files), 'r', encoding="utf-8") as fobj: + trec_txt = fobj.read() + raw_body_list.append(trec_txt) + doc_list.append(get_trec_doc(trec_txt)) + return doc_list, raw_body_list + + +def main(index_path, documents_path): + # build the schema + schema_builder = tantivy.SchemaBuilder() + schema_builder.add_text_field("body", stored=True) + schema_builder.add_unsigned_field("doc_id", stored=True) + schema = schema_builder.build() + # create index + index = tantivy.Index(schema, path=index_path) + # read all trec doc + documents, raw_docs = get_trec_docs(documents_path) + # add documents + print('Building sparse index of {} docs...'.format(len(documents))) + writer = index.writer() + for i, doc in enumerate(raw_docs): + writer.add_document(tantivy.Document( + body=[doc], # this is the raw text of the trec document + doc_id=i + )) + if (i + 1) % 100000 == 0: + writer.commit() + print('Indexed {} docs'.format(i + 1)) + writer.commit() + print('Built sparse index') + index.reload() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Index documents') + parser.add_argument('--index_path', type=str, help='path to store the index') + parser.add_argument('--document_path', type=str, help='path for documents to index') + args = parser.parse_args() + + main(args.index_path, args.document_path) diff --git a/macaw/cis.py b/macaw/cis.py index a36ab54..c2a0f5d 100644 --- a/macaw/cis.py +++ b/macaw/cis.py @@ -1,15 +1,19 @@ -""" -The CIS class. - -Authors: Hamed Zamani (hazamani@microsoft.com) -""" - from abc import ABC, abstractmethod +from datetime import datetime +from typing import List +import logging + from func_timeout import FunctionTimedOut +from core.response.handler import ResponseGeneratorHandler from macaw import interface, util -from macaw.core.interaction_handler.user_requests_db import InteractionDB +from macaw.core.dialogue_manager.dialogue_manager import DialogManager +from macaw.core.response.action_detection import RequestDispatcher +from macaw.core.interaction_handler import CurrentAttributes from macaw.core.interaction_handler.msg import Message +from macaw.core.interaction_handler.user_requests_db import InteractionDB +from macaw.core.nlp_pipeline.nlp_pipeline import NlpPipeline +from macaw.core.output_handler import naive_output_selection class CIS(ABC): @@ -22,75 +26,83 @@ def __init__(self, params): params(dict): A dict containing some parameters. """ self.params = params - if params['mode'] == 'live': - self.params['live_request_handler'] = self.live_request_handler - self.msg_db = InteractionDB(host=self.params['interaction_db_host'], - port=self.params['interaction_db_port'], - dbname=self.params['interaction_db_name']) - elif params['mode'] == 'exp': - self.params['experimental_request_handler'] = self.request_handler_func + if params["mode"] == "live": + self.params["live_request_handler"] = self.live_request_handler + elif params["mode"] == "exp": + self.params["experimental_request_handler"] = self.file_request_handler + + self.msg_db = InteractionDB( + host=self.params["interaction_db_host"], + port=self.params["interaction_db_port"], + dbname=self.params["interaction_db_name"], + ) + self.logger = logging.getLogger("MacawLogger") + self.params["curr_attrs"] = self.curr_attrs = CurrentAttributes() + self.params["actions"] = self.generate_actions() self.interface = interface.get_interface(params) + self.request_dispatcher = RequestDispatcher(self.params) + self.output_selection = naive_output_selection.NaiveOutputProcessing({}) + self.dialogue_manager = DialogManager() + self.nlp_pipeline = NlpPipeline(params.get("nlp_models", {})) + self.response_generator_handler = ResponseGeneratorHandler(params.get("response_generator_models", {})) + try: self.nlp_util = util.NLPUtil(self.params) - self.params['nlp_util'] = self.nlp_util + self.params["nlp_util"] = self.nlp_util except Exception as ex: - self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module.') - self.timeout = self.params['timeout'] if 'timeout' in self.params else -1 + self.logger.warning( + f"There is a problem with setting up the NLP utility module. {type(ex)}, {ex}" + ) + self.timeout = self.params["timeout"] if "timeout" in self.params else -1 - def live_request_handler(self, msg): + def live_request_handler(self, msg: Message): try: - # load conversation from the database and add the current message to the database - conv = [msg] + self.msg_db.get_conv_history(user_id=msg.user_id, max_time=10 * 60 * 1000, max_count=10) - self.msg_db.insert_one(msg) + # load conversation from the database. + history = self.msg_db.get_conv_history( + user_id=msg.user_id, max_time=10*60*1000, max_count=10 + ) + + conv = [msg] + history # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv]) output_msg = self.request_handler_func(conv) + + # Save the output and conversation state in DB. self.msg_db.insert_one(output_msg) return output_msg except FunctionTimedOut: + print(f"live_request_handler method timed out.") msg_info = dict() - msg_info['msg_id'] = msg.msg_info['msg_id'] - msg_info['msg_source'] = 'system' - msg_info['msg_type'] = 'error' - text = 'Time out, no result!' - timestamp = util.current_time_in_milliseconds() - error_msg = Message(msg.user_interface, msg.user_id, msg.user_info, msg_info, text, timestamp) + msg_info["msg_id"] = msg.msg_info["msg_id"] + msg_info["msg_source"] = "system" + msg_info["msg_type"] = "error" + text = "Time out, no result!" + timestamp = datetime.utcnow() + error_msg = Message( + user_interface=msg.user_interface, + user_id=msg.user_id, + user_info=msg.user_info, + msg_info=msg_info, + text=text, + timestamp=timestamp, + ) self.msg_db.insert_one(error_msg) return error_msg - # def experimental_request_handler(self, str_list): - # if not isinstance(str_list, list): - # raise Exception('The input should be a list!') - # - # conv_list = [] - # for i in range(len(str_list)): - # if not isinstance(str_list[i], str): - # raise Exception('Each element of the input should be a string!') - # user_info = {'first_name': 'NONE'} - # msg_info = {'msg_id': -1, - # 'msg_type': 'command' if str_list[i].startswith('#') else 'text', - # 'msg_source': 'user'} - # msg = Message(user_interface='NONE', - # user_id=-1, - # user_info=user_info, - # msg_info=msg_info, - # text=str_list[i], - # timestamp=util.current_time_in_milliseconds()) - # conv_list.append(msg) - # conv_list.reverse() - # - # if self.timeout > 0: - # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv_list]) - # else: - # output_msg = self.request_handler_func(conv_list) - # return output_msg + def file_request_handler(self, msg: Message): + # Just delegate the call to the live request handler. + return self.live_request_handler(msg) @abstractmethod - def request_handler_func(self, conv_list): + def request_handler_func(self, conv_list: List[Message]) -> Message: pass @abstractmethod def run(self): - pass \ No newline at end of file + pass + + @abstractmethod + def generate_actions(self) -> dict: + pass diff --git a/macaw/core/input_handler/__init__.py b/macaw/core/dialogue_manager/__init__.py similarity index 100% rename from macaw/core/input_handler/__init__.py rename to macaw/core/dialogue_manager/__init__.py diff --git a/macaw/core/dialogue_manager/dialogue_manager.py b/macaw/core/dialogue_manager/dialogue_manager.py new file mode 100644 index 0000000..d64f48f --- /dev/null +++ b/macaw/core/dialogue_manager/dialogue_manager.py @@ -0,0 +1,29 @@ +from core.dialogue_manager.dst import DST, State + + +class DialogManager(object): + def __init__(self, dst=None): + self.dst = DST() if dst is None else dst + + @classmethod + def decode(cls, encoded: dict): + return DialogManager(DST.decode(encoded["dst"])) + + def process_turn(self, nlp_pipeline_output: dict): + # Get these from the nlp_pipeline output. + input_alphabet = None + context = None + + new_state = self.dst.transition(input_alphabet, context) + if isinstance(new_state, State): + self.dst.update(new_state) + print(f"new_state={new_state}") + else: + print(f"Couldn't change state: {new_state}") + pass + # Save output result in conversation context. + + def encode(self) -> dict: + return { + "dst": self.dst.encode() + } diff --git a/macaw/core/dialogue_manager/dst.py b/macaw/core/dialogue_manager/dst.py new file mode 100644 index 0000000..142c60f --- /dev/null +++ b/macaw/core/dialogue_manager/dst.py @@ -0,0 +1,42 @@ +import enum + +from typing import Union + + +class State(enum.Enum): + launch = 1 + recipe_selection = 2 + ingredients = 3 + show_steps = 4 + goodbye = 5 + + +class DST: + def __init__(self, state=None): + self.curr_state = State.launch if state is None else state + + @classmethod + def decode(cls, encoded: dict): + return DST(State(encoded["curr_state"])) + + def transition(self, input_alphabet, context) -> Union[str, State]: + """ + Given the input alphabet and context (and thing that tells about the ongoing conversation), it returns what + should the destination state be. Returns message str if there is no valid transition. + """ + if self.curr_state == State.launch: + if input_alphabet == 'recipe_question': + return State.recipe_selection + elif self.curr_state == State.recipe_selection: + if input_alphabet == 'option_selection': + return State.ingredients + return f"Could not make a transition from {self.curr_state} using input {input_alphabet}." + + def update(self, new_state: State): + self.curr_state = new_state + + def encode(self) -> dict: + return { + "curr_state": self.curr_state.value + } + diff --git a/macaw/core/interaction_handler/__init__.py b/macaw/core/interaction_handler/__init__.py index 70bb3a1..0bc32cc 100644 --- a/macaw/core/interaction_handler/__init__.py +++ b/macaw/core/interaction_handler/__init__.py @@ -1,5 +1,8 @@ """ -FILE DESCRIPTION +The interaction handler init. -Authors: Hamed Zamani (hazamani@microsoft.com) -""" \ No newline at end of file +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) +""" +from .attributes import CurrentAttributes, UserAttributes +from .msg import Message +from .user_requests_db import InteractionDB diff --git a/macaw/core/interaction_handler/attributes.py b/macaw/core/interaction_handler/attributes.py new file mode 100644 index 0000000..77c2040 --- /dev/null +++ b/macaw/core/interaction_handler/attributes.py @@ -0,0 +1,23 @@ +""" +The attribute singleton classes. + +Authors: George Wei (gzwei@umass.edu) +""" + + +class Singleton: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + def __new__(self, **kwargs): + if not hasattr(self, "instance"): + self.instance = super().__new__(self, **kwargs) + return self.instance + + +class CurrentAttributes(Singleton): + pass + + +class UserAttributes(Singleton): + pass diff --git a/macaw/core/interaction_handler/msg.py b/macaw/core/interaction_handler/msg.py index 1ae4d28..a186887 100644 --- a/macaw/core/interaction_handler/msg.py +++ b/macaw/core/interaction_handler/msg.py @@ -1,44 +1,88 @@ """ -The message used to represent each interaction in Macaw. +The message used to represent each interaction in Macaw. It could be either from the user side or from the bot side. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ +from datetime import datetime +from typing import Optional, Union, Dict + +from .attributes import UserAttributes +from ..dialogue_manager.dialogue_manager import DialogManager class Message: - def __init__(self, user_interface, user_id, user_info, msg_info, text, timestamp): + def __init__( + self, + user_interface: str, + user_id: Union[str, int], + text: str, + timestamp: datetime, + response: Optional[str] = None, + user_info: Optional[Dict[str, any]] = None, + msg_info: Optional[Dict[str, any]] = None, + actions_result: Optional[Dict[str, any]] = None, + dialog_manager: Optional[DialogManager] = None, + nlp_pipeline_result: Optional[Dict[str, any]] = None, + user_attributes: Optional[Dict[str, any]] = None, + ): """ An object for input and output Message. Args: user_interface(str): The interface name used for this message (e.g., 'telegram') - user_id(str or int): The user ID. - user_info(dict): The dict containing some more information about the user. - msg_info(dict): The dict containing some more information about the message. - text(str): The message text. - timestamp(int): The timestamp of message in milliseconds. + user_id(str | int): The user ID. + text(str): The user message text. + response(str): (Optional) The bot message text. None indicates bot response hasn't been generated yet. + timestamp(datetime.datetime): The timestamp of message. + user_info(dict): (Optional) The dict containing some more information about the user. + msg_info(dict): (Optional) The dict containing some more information about the message. + actions_result(dict): (Optional) The results from the various actions given the conversation history. + dialog_manager(DialogManager): (Optional) The dialog manager. + nlp_pipeline_result(dict): (Optional) The results from running the NLP pipeline. + user_attributes(dict): (Optional) The user attributes for this given message. """ + self.user_interface = user_interface self.user_id = user_id - self.user_info = user_info - self.msg_info = msg_info self.text = text + self.response = response self.timestamp = timestamp - self.user_interface = user_interface + self.user_info = user_info + self.msg_info = msg_info + self.actions_result = actions_result + self.dialog_manager = dialog_manager + self.nlp_pipeline_result = nlp_pipeline_result + + if user_attributes is None: + user_attributes = dict() + + self.user_attributes = UserAttributes(**user_attributes) @classmethod def from_dict(cls, msg_dict): """ - Get a Message object from dict. + Get a Message object from serialized dict obtained from database (e.g. MongoDB). Args: msg_dict(dict): A dict containing all the information required to construct a Message object. Returns: A Message object. """ - user_interface = msg_dict['user_interface'] if 'user_interface' in msg_dict else None - user_id = msg_dict['user_id'] if 'user_id' in msg_dict else None - user_info = msg_dict['user_info'] if 'user_info' in msg_dict else None - msg_info = msg_dict['msg_info'] if 'msg_info' in msg_dict else None - text = msg_dict['text'] if 'text' in msg_dict else None - timestamp = msg_dict['timestamp'] if 'timestamp' in msg_dict else None - return cls(user_interface, user_id, user_info, msg_info, text, timestamp) \ No newline at end of file + # Deserialize custom variables of Message class. + msg_dict["dialog_manager"] = DialogManager.decode(msg_dict["dialog_manager"]) + + return cls(**msg_dict) + + def __iter__(self): + """ + This iterable is needed so that Message object can be inserted into a MongoDB table as a dictionary. It should + provide encoding logic for all custom data types used inside Message class. + """ + for attr, value in self.__dict__.items(): + value_ret = value + if isinstance(value, UserAttributes): + value_ret = value.__dict__ + elif isinstance(value, DialogManager): + value_ret = value.encode() + elif isinstance(value, datetime): + value_ret = str(value) + yield attr, value_ret diff --git a/macaw/core/interaction_handler/user_requests_db.py b/macaw/core/interaction_handler/user_requests_db.py index bbb9342..dfab257 100644 --- a/macaw/core/interaction_handler/user_requests_db.py +++ b/macaw/core/interaction_handler/user_requests_db.py @@ -1,12 +1,14 @@ """ The conversation (or interaction) database implemented using MongoDB. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ +from datetime import datetime, timedelta +from typing import List + from pymongo import MongoClient -from macaw import util from macaw.core.interaction_handler.msg import Message @@ -14,23 +16,35 @@ class InteractionDB: def __init__(self, host, port, dbname): self.client = MongoClient(host, port) self.db = self.client[dbname] - self.col = self.db['macaw_msgs'] + self.col = self.db["macaw_msgs"] def insert_one(self, msg): - if msg.user_id is None or msg.text is None or msg.timestamp is None or msg.user_interface is None: - raise Exception('Each message should include a user_interface, user_id, text, and timestamp.') - self.col.insert_one(msg.__dict__) + self.col.insert_one(dict(msg)) + + def update_one(self, user_id, updates): + self.col.find_one_and_update( + {"user_id": user_id}, updates, sort=[("timestamp", -1)] + ) def get_all(self): - print('Using get_all is only recommended for development purposes. It is not efficient!') + print( + "Using get_all is only recommended for development purposes. It is not efficient!" + ) return self.dict_list_to_msg_list(self.col.find({})) - def get_conv_history(self, user_id, max_time, max_count): + def get_conv_history(self, user_id, max_time, max_count) -> List[Message]: if max_time is None: - res = self.col.find({'user_id': user_id}).sort([('timestamp', -1)]) + res = self.col.find({"user_id": user_id}, sort=[("timestamp", -1)]) else: - res = self.col.find({'user_id': user_id, - 'timestamp': {'$gt': util.current_time_in_milliseconds() - max_time}}).sort([('timestamp', -1)]) + res = self.col.find( + { + "user_id": user_id, + "timestamp": { + "$gt": datetime.utcnow() - timedelta(minutes=max_time) + }, + }, + sort=[("timestamp", -1)], + ) if max_count is not None: res = res.limit(max_count) @@ -40,9 +54,10 @@ def close(self): self.client.close() @staticmethod - def dict_list_to_msg_list(msg_dict_list): - return [Message.from_dict(msg_dict) for msg_dict in msg_dict_list] - - - - + def dict_list_to_msg_list(msg_dict_list) -> List[Message]: + msg_list = [] + for msg_dict in msg_dict_list: + msg_dict.pop("_id") + message = Message.from_dict(msg_dict=msg_dict) + msg_list.append(message) + return msg_list diff --git a/macaw/core/mrc/__init__.py b/macaw/core/mrc/__init__.py index a56086b..587747a 100644 --- a/macaw/core/mrc/__init__.py +++ b/macaw/core/mrc/__init__.py @@ -3,6 +3,7 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ +import logging from macaw.core.mrc import drqa_mrc @@ -17,7 +18,8 @@ def get_mrc_model(params): Returns: An MRC object for machine reading comprehension. """ - params['logger'].info('The MRC model for QA: ' + params['mrc']) + logger = logging.getLogger("MacawLogger") + logger.info('The MRC model for QA: ' + params['mrc']) if params['mrc'] == 'drqa': return drqa_mrc.DrQA(params) else: diff --git a/macaw/core/mrc/drqa_mrc.py b/macaw/core/mrc/drqa_mrc.py index 8c72659..b3e5c4a 100644 --- a/macaw/core/mrc/drqa_mrc.py +++ b/macaw/core/mrc/drqa_mrc.py @@ -78,7 +78,3 @@ def get_results(self, conv_list, doc): for i, p in enumerate(predictions, 1): results.append(Document(None, None, p[0], p[1])) return results - - - - diff --git a/macaw/core/nlp_pipeline/__init__.py b/macaw/core/nlp_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/macaw/core/nlp_pipeline/nlp_pipeline.py b/macaw/core/nlp_pipeline/nlp_pipeline.py new file mode 100644 index 0000000..358ade8 --- /dev/null +++ b/macaw/core/nlp_pipeline/nlp_pipeline.py @@ -0,0 +1,68 @@ +import json +import logging +import multiprocessing +import time +from multiprocessing import Pool +from typing import List + +import requests + +from core.interaction_handler import Message + + +class RemoteModel: + def __init__(self, model_name: str, endpoint: str): + self.model_name = model_name + self.endpoint = endpoint + + def run(self, request: dict) -> dict: + try: + response = requests.post(url=self.endpoint, data=json.dumps(request)) + return response.json() + except Exception as e: + return { + "response": f"Error in post request call for {self.model_name}: {e}", + "error": True + } + + +class NlpPipeline: + def __init__(self, modules: dict): + self.modules = dict() + self.logger = logging.getLogger("MacawLogger") + for model_name, endpoint in modules.items(): + self.modules[model_name] = RemoteModel(model_name, endpoint) + + def run(self, conv_list: List[Message]): + """ + Runs all the models and saves their results in the latest conversation (conv_list[0]) message. + """ + nlp_pipeline_result = {} + + with Pool(processes=4) as pool: + all_async_results = [] + for model_name, model in self.modules.items(): + # pass in required input to every model. + model_input = {"text": "input text for model"} + async_result = pool.apply_async( + func=model.run, + args=(model_input,) + ) + all_async_results.append((model_name, async_result)) + pool.close() + + max_wait_time_in_secs = 10 # wait time for all modules combined. + end_time = time.time() + max_wait_time_in_secs + for model_name, async_result in all_async_results: + try: + nlp_pipeline_result[model_name] = async_result.get(timeout=max(end_time - time.time(), 0)) + except multiprocessing.TimeoutError as te: + resp_msg = f"Module {model_name} timed out." + self.logger.error(f"{resp_msg} {te}") + nlp_pipeline_result[model_name] = { + "response": resp_msg, + "error": True, + } + pool.join() + + conv_list[0].nlp_pipeline_result = nlp_pipeline_result diff --git a/macaw/core/output_handler/naive_output_selection.py b/macaw/core/output_handler/naive_output_selection.py index c7487bc..33fe450 100644 --- a/macaw/core/output_handler/naive_output_selection.py +++ b/macaw/core/output_handler/naive_output_selection.py @@ -1,12 +1,12 @@ """ The naive output post processing unit. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ +from datetime import datetime -from macaw import util -from macaw.core.output_handler.output_selection import OutputProcessing from macaw.core.interaction_handler.msg import Message +from macaw.core.output_handler.output_selection import OutputProcessing class NaiveOutputProcessing(OutputProcessing): @@ -35,23 +35,25 @@ def output_selection(self, conv_list, candidate_outputs): Returns: A str denoting the selected action. If none is selected, None is returned. """ - if '#get_doc' in candidate_outputs: - return '#get_doc' - if 'qa' in candidate_outputs: - if len(candidate_outputs['qa'][0].text) > 0: - if conv_list[0].text.endswith('?') \ - or conv_list[0].text.lower().startswith('what') \ - or conv_list[0].text.lower().startswith('who') \ - or conv_list[0].text.lower().startswith('when') \ - or conv_list[0].text.lower().startswith('where') \ - or conv_list[0].text.lower().startswith('how'): - return 'qa' - if 'retrieval' in candidate_outputs: - if len(candidate_outputs['retrieval']) > 0: - return 'retrieval' + if "#get_doc" in candidate_outputs: + return "#get_doc" + if "qa" in candidate_outputs: + if len(candidate_outputs["qa"][0].text) > 0: + if ( + conv_list[0].text.endswith("?") + or conv_list[0].text.lower().startswith("what") + or conv_list[0].text.lower().startswith("who") + or conv_list[0].text.lower().startswith("when") + or conv_list[0].text.lower().startswith("where") + or conv_list[0].text.lower().startswith("how") + ): + return "qa" + if "retrieval" in candidate_outputs: + if len(candidate_outputs["retrieval"]) > 0: + return "retrieval" return None - def get_output(self, conv, candidate_outputs): + def get_output(self, conv_list, candidate_outputs): """ The response Message generation method. @@ -65,35 +67,46 @@ def get_output(self, conv, candidate_outputs): Returns: A response Message to be sent to the user. """ - user_id = conv[0].user_id - user_info = conv[0].user_info + user_id = conv_list[0].user_id + user_info = conv_list[0].user_info msg_info = dict() - msg_info['msg_id'] = conv[0].msg_info['msg_id'] - msg_info['msg_source'] = 'system' - text = '' - user_interface = conv[0].user_interface + msg_info["msg_id"] = conv_list[0].msg_info["msg_id"] + msg_info["msg_source"] = "system" + response = "" + user_interface = conv_list[0].user_interface - selected_action = self.output_selection(conv, candidate_outputs) + selected_action = self.output_selection(conv_list, candidate_outputs) if selected_action is None: - msg_info['msg_type'] = 'text' - msg_info['msg_creator'] = 'no answer error' - text = 'No response has been found! Please try again!' - elif selected_action == 'qa': - msg_info['msg_type'] = conv[0].msg_info['msg_type'] - msg_info['msg_creator'] = 'qa' - text = candidate_outputs['qa'][0].text - elif selected_action == 'retrieval': - msg_info['msg_type'] = 'options' - msg_info['msg_creator'] = 'retrieval' - text = 'Retrieved document list (click to see the document content):' - msg_info['options'] = [(output.title, '#get_doc ' + output.id, output.score) for output in candidate_outputs['retrieval']] - elif selected_action == '#get_doc': - msg_info['msg_type'] = 'text' - msg_info['msg_creator'] = '#get_doc' - text = candidate_outputs['#get_doc'][0].text + msg_info["msg_type"] = "text" + msg_info["msg_creator"] = "no answer error" + response = "No response has been found! Please try again!" + elif selected_action == "qa": + msg_info["msg_type"] = conv_list[0].msg_info["msg_type"] + msg_info["msg_creator"] = "qa" + response = candidate_outputs["qa"][0].text + elif selected_action == "retrieval": + msg_info["msg_type"] = "options" + msg_info["msg_creator"] = "retrieval" + response = "Retrieved document list (click to see the document content):" + msg_info["options"] = [ + (output.title, "#get_doc " + output.id, output.score) + for output in candidate_outputs["retrieval"] + ] + elif selected_action == "#get_doc": + msg_info["msg_type"] = "text" + msg_info["msg_creator"] = "#get_doc" + response = candidate_outputs["#get_doc"][0].text else: - raise Exception('The candidate output key is not familiar!') - timestamp = util.current_time_in_milliseconds() - if timestamp <= conv[0].timestamp: - raise Exception('There is a problem in the output timestamp!') - return Message(user_interface, user_id, user_info, msg_info, text, timestamp) \ No newline at end of file + raise Exception("The candidate output key is not familiar!") + timestamp = datetime.utcnow() + if timestamp <= conv_list[0].timestamp: + raise Exception("There is a problem in the output timestamp!") + return Message( + user_interface=user_interface, + user_id=user_id, + user_info=user_info, + msg_info=msg_info, + text=conv_list[0].text, + response=response, + timestamp=timestamp + ) diff --git a/macaw/core/response/__init__.py b/macaw/core/response/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/macaw/core/input_handler/action_detection.py b/macaw/core/response/action_detection.py similarity index 95% rename from macaw/core/input_handler/action_detection.py rename to macaw/core/response/action_detection.py index a49439f..dc4810e 100644 --- a/macaw/core/input_handler/action_detection.py +++ b/macaw/core/response/action_detection.py @@ -6,7 +6,7 @@ import multiprocessing -from macaw.core.input_handler import actions +from macaw.core.response import actions class PreActionRequestDispatcher: @@ -99,7 +99,10 @@ def dispatch(self, conv_list): manager = multiprocessing.Manager() action_results = manager.dict() for action in self.params['actions']: - p = multiprocessing.Process(target=actions.run_action, args=[action, conv_list.copy(), self.params, action_results]) + p = multiprocessing.Process( + target=actions.run_action, + args=[action, conv_list.copy(), self.params, action_results] + ) action_processes.append(p) p.start() @@ -107,7 +110,7 @@ def dispatch(self, conv_list): p.join() candidate_outputs = dict() - for key in action_results: + for key in action_results.keys(): if action_results[key]: candidate_outputs[key] = action_results[key] return candidate_outputs diff --git a/macaw/core/input_handler/actions.py b/macaw/core/response/actions.py similarity index 85% rename from macaw/core/input_handler/actions.py rename to macaw/core/response/actions.py index 1b8fb38..d877699 100644 --- a/macaw/core/input_handler/actions.py +++ b/macaw/core/response/actions.py @@ -4,9 +4,10 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from abc import ABC, abstractmethod -from func_timeout import func_timeout, FunctionTimedOut import traceback +from abc import ABC, abstractmethod + +from func_timeout import FunctionTimedOut, func_timeout class Action(ABC): @@ -38,7 +39,7 @@ def run(conv_list, params): Returns: A list of Documents. """ - return params['actions']['retrieval'].get_results(conv_list) + return params["actions"]["retrieval"].get_results(conv_list) class GetDocFromIndex(Action): @@ -54,7 +55,7 @@ def run(conv_list, params): Returns: A list of Documents with a length of 1. """ - return params['actions']['retrieval'].get_doc_from_index(params['doc_id']) + return params["actions"]["retrieval"].get_doc_from_index(params["doc_id"]) class QAAction(Action): @@ -73,12 +74,12 @@ def run(conv_list, params): """ doc_list = RetrievalAction.run(conv_list, params) - doc = '' + doc = "" for i in range(len(doc_list)): doc = doc_list[i].text if len(doc.strip()) > 0: break - return params['actions']['qa'].get_results(conv_list, doc) + return params["actions"]["qa"].get_results(conv_list, doc) def run_action(action, conv_list, params, return_dict): @@ -93,17 +94,21 @@ def run_action(action, conv_list, params, return_dict): return_dict(dict): A shared dict for all processes running this action. The actions' outputs should be added to this dict. """ - if action == 'retrieval': + if action == "retrieval": action_func = RetrievalAction.run - elif action == 'qa': + elif action == "qa": action_func = QAAction.run else: - raise Exception('Unknown Action!') + raise Exception("Unknown Action!") try: - return_dict[action] = func_timeout(params['timeout'], action_func, args=[conv_list, params]) + return_dict[action] = func_timeout( + params["timeout"], action_func, args=[conv_list, params] + ) except FunctionTimedOut: - params['logger'].warning('The action "%s" did not respond in %d seconds.', action, params['timeout']) + params["logger"].warning( + 'The action "%s" did not respond in %d seconds.', action, params["timeout"] + ) except Exception: return_dict[action] = None traceback.print_exc() diff --git a/macaw/core/response/docker.py b/macaw/core/response/docker.py new file mode 100644 index 0000000..fd75cdc --- /dev/null +++ b/macaw/core/response/docker.py @@ -0,0 +1,30 @@ +import json + +import requests + +from core.response.response_generator import ResponseGenerator + + +class ResponseGeneratorDocker(ResponseGenerator): + """ + A class that encapsulates an ML model running in a local docker container. + """ + + def __init__(self, name: str, endpoint: str): + super().__init__(name) + self.endpoint = endpoint + + def run(self, conv_list) -> dict: + # extract request from the conversation. + request = { + "text": "input message for RG docker model." + } + + try: + response = requests.post(url=self.endpoint, data=json.dumps(request)) + return response.json() + except Exception as e: + return { + "response": f"Error in post request call for {self.name}: {e}", + "error": True + } diff --git a/macaw/core/response/handler.py b/macaw/core/response/handler.py new file mode 100644 index 0000000..095ed96 --- /dev/null +++ b/macaw/core/response/handler.py @@ -0,0 +1,37 @@ +import logging +from typing import List + +from core.response.docker import ResponseGeneratorDocker +from core.response.response_generator import ResponseGenerator + + +class ResponseGeneratorHandler: + def __init__(self, rg_models: dict): + self.rg_models = dict() + self.logger = logging.getLogger("MacawLogger") + + for model_name, model in rg_models.items(): + if isinstance(model, ResponseGenerator): + self.rg_models[model_name] = model + elif isinstance(model, str) and model.startswith("http://"): + self.rg_models[model_name] = ResponseGeneratorDocker(model_name, model) + else: + self.logger.warning(f"Response generator {model_name}:{model} is not supported.") + + def models_selector(self, conv_list) -> List[str]: + selected_models = [] + for model_name in self.rg_models: + if model_name == "qa": + # decide if qa RG should be run or not here. + selected_models.append(model_name) + elif model_name == "punctuation": + selected_models.append(model_name) + else: + self.logger.warning(f"Model selector not written for {model_name}. Ignoring the model.") + return selected_models + + def run_models(self, model_names: List[str], conv_list) -> dict: + models_response = dict() + for model_name in model_names: + models_response[model_name] = self.rg_models[model_name].run(conv_list) + return models_response diff --git a/macaw/core/response/punctuation.py b/macaw/core/response/punctuation.py new file mode 100644 index 0000000..ef12b86 --- /dev/null +++ b/macaw/core/response/punctuation.py @@ -0,0 +1,34 @@ +import logging +import time + +from macaw.util.custom_logging import LoggerFactory +from deepmultilingualpunctuation import PunctuationModel +from core.response.response_generator import ResponseGenerator + + +class ResponseGeneratorPunctuation(ResponseGenerator): + def __init__(self, name): + super().__init__(name) + self.logger = LoggerFactory.create_logger(params={ + "logger_name": "macaw.core.response.response_generator.punctuation", + "logger_level": logging.INFO + }) + + self.logger.info("Going to download punctuation ML model. This takes time.") + # self.model = PunctuationModel() + self.logger.info("Punctuation model loaded successfully.") + + def run(self, conv_list) -> dict: + # generate input text from the input conversation. + input_text = "My name is Clara and I live in Berkeley California Ist das eine Frage Frau Müller" + + t0 = time.time() + # result = self.model.restore_punctuation(input_text) + result = "punctuation model output." + duration = time.time() - t0 + + return { + "response": f"Local RG: {self.name}. {result}", + "error": False, + "performance": duration + } diff --git a/macaw/core/response/response_generator.py b/macaw/core/response/response_generator.py new file mode 100644 index 0000000..a97aef7 --- /dev/null +++ b/macaw/core/response/response_generator.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + + +class ResponseGenerator(ABC): + """ + An abstract class for response generator (RG). It can be implemented as a local RG using a local class or as a + remote RG using a docker container to generate the response. + """ + def __init__(self, name: str): + self.name = name + + @abstractmethod + def run(self, conv_list) -> dict: + pass diff --git a/macaw/core/retrieval/__init__.py b/macaw/core/retrieval/__init__.py index 994182e..2c0dabe 100644 --- a/macaw/core/retrieval/__init__.py +++ b/macaw/core/retrieval/__init__.py @@ -3,16 +3,18 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ +import logging + import macaw.core.retrieval.bing_api -import macaw.core.retrieval.indri -from macaw.core.retrieval import search_engine, query_generation +import macaw.core.retrieval.tantivy +from macaw.core.retrieval import query_generation def get_retrieval_model(params): """ This method returns the Retrieval class requested in the parameter dict. Args: - params(dict): A dict of parameters. In this method, the parameters 'logger' and 'query_generation', and + params(dict): A dict of parameters. In this method, the parameters 'query_generation', and 'search_engine' are required. Based on the requested retrievel model, some more parameters may be mandatory. Currently, Macaw serves two different search engines. One is based on indri (http://lemurproject.org/indri.php), and the other one is the Microsoft Bing API. If you want to retrieve results from your own document collection, @@ -21,24 +23,34 @@ def get_retrieval_model(params): Returns: A Retrieval object for document retrieval. """ - params['logger'].info('The query generation model for retrieval: ' + params['query_generation']) - if params['query_generation'] == 'simple': + logger = logging.getLogger("MacawLogger") + logger.info( + "The query generation model for retrieval: " + params["query_generation"] + ) + if params["query_generation"] == "simple": q_generation = query_generation.SimpleQueryGeneration(params) else: - raise Exception('The requested query generation model does not exist!') + raise Exception("The requested query generation model does not exist!") - params['logger'].info('The search engine for retrieval: ' + params['search_engine']) - if params['search_engine'] == 'indri': - return macaw.core.retrieval.indri.Indri({'query_generation': q_generation, - 'indri_path': params['search_engine_path'], - 'index': params['col_index'], - 'text_format': params['col_text_format'], - 'results_requested': params['results_requested'], - 'logger': params['logger']}) - elif params['search_engine'] == 'bing': - return macaw.core.retrieval.bing_api.BingWebSearch({'query_generation': q_generation, - 'bing_key': params['bing_key'], - 'results_requested': params['results_requested'], - 'logger': params['logger']}) + logger.info("The search engine for retrieval: " + params["search_engine"]) + if params["search_engine"] == "tantivy": + return macaw.core.retrieval.tantivy.Tantivy( + { + "query_generation": q_generation, + "path": params["search_engine_path"], + "load": True, + "results_requested": params["results_requested"], + "logger": logger, + } + ) + elif params["search_engine"] == "bing": + return macaw.core.retrieval.bing_api.BingWebSearch( + { + "query_generation": q_generation, + "bing_key": params["bing_key"], + "results_requested": params["results_requested"], + "logger": logger, + } + ) else: - raise Exception('The requested retrieval model does not exist!') \ No newline at end of file + raise Exception(f'{params["search_engine"]} retrieval model does not exist!') diff --git a/macaw/core/retrieval/indri.py b/macaw/core/retrieval/indri.py deleted file mode 100644 index 77bb073..0000000 --- a/macaw/core/retrieval/indri.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -The Indri search engine. - -Authors: Hamed Zamani (hazamani@microsoft.com) -""" - -import os -import subprocess - -import pyndri - -from macaw.core.retrieval.doc import get_trec_doc -from macaw.core.retrieval.search_engine import Retrieval - - -class Indri(Retrieval): - def __init__(self, params): - """ - The Indri retrieval model. Indri is an open-source search engine implemented as part of the lemur project by - UMass Amherst and CMU. Refer to http://lemurproject.org/indri.php for more information. - The retrieval model used here is based on language modeling framework and retrieves documents using the query - likelihood retrieval model [Ponte & Croft; SIGIR 1998] and Dirichlet prior smoothing [Zhai and Lafferty; SIGIR - 2001]. It is implemented using the Pyndri [Van Gysel et al.; ECIR 2017], which is a python interface to Indri. - Refer to http://lemurproject.org/indri.php for more information on the Lemur toolkit. - - Args: - params(dict): A dict containing some parameters. Here is the list of all required parameters: - 'indri_path': The path to the installed Indri toolkit. - 'index': The path to the Indri index constructed from the collection. - 'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1. - 'text_format': The text format for document collection (e.g., 'trectext'). - Note that the parameters 'query_generation' and 'logger' are required by the parent class. - """ - super().__init__(params) - self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1 - self.indri_path = self.params['indri_path'] - self.index = pyndri.Index(self.params['index']) - self.term2id, self.id2term, self.id2df = self.index.get_dictionary() - self.id2tf = self.index.get_term_frequencies() - - def retrieve(self, query): - """ - This method retrieve documents in response to the given query. - - Args: - query(str): The query string. - - Returns: - A list of Documents with the maximum length of the 'results_requested' parameter. - """ - int_results = self.index.query(query, results_requested=self.results_requested) - results = [] - for int_doc_id, score in int_results: - # ext_doc_id, content_term_id = self.index.document(int_doc_id) - # index_content = [self.id2term[term_id] if term_id> 0 else 'UNK' for term_id in content_term_id] - doc = self.get_doc_from_index(int_doc_id)[0] - doc.score = score - doc.id = str(int_doc_id) - results.append(doc) - return results - - def get_doc_from_index(self, doc_id): - """ - This method retrieves a document content for a given document id. - - Args: - doc_id(str): The document ID. - - Returns: - A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns - a list of Documents with a length of 1. - """ - content = subprocess.run([os.path.join(self.indri_path, 'dumpindex/dumpindex'), self.params['index'], - 'dt', str(doc_id)], stdout=subprocess.PIPE).stdout.decode('UTF-8') - if self.params['text_format'] == 'trectext': - doc = get_trec_doc(content) - else: - raise Exception('The requested text format is not supported!') - return [doc] \ No newline at end of file diff --git a/macaw/core/retrieval/query_generation.py b/macaw/core/retrieval/query_generation.py index 3047eb5..b1139b5 100644 --- a/macaw/core/retrieval/query_generation.py +++ b/macaw/core/retrieval/query_generation.py @@ -4,8 +4,8 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from abc import ABC, abstractmethod import string +from abc import ABC, abstractmethod class QueryGeneration(ABC): @@ -59,12 +59,14 @@ def get_query(self, conv_list): """ # q = ' '.join(msg.text for msg in conv_list) q = conv_list[0].text - if 'use_coref' in self.params and self.params['use_coref']: + if "use_coref" in self.params and self.params["use_coref"]: q_coref = self.get_query_coref(conv_list) for key in q_coref: - q += ' ' + ' '.join(q_coref[key]) + q += " " + " ".join(q_coref[key]) - q = q.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip() + q = q.translate( + str.maketrans(string.punctuation, " " * len(string.punctuation)) + ).strip() # print(q) return q @@ -84,20 +86,20 @@ def get_query_coref(self, conv_list): """ corenlp_coref_result = self.compute_corefs(conv_list) q_coref = dict() - last_index = len(corenlp_coref_result['sentences']) - for key in corenlp_coref_result['corefs']: + last_index = len(corenlp_coref_result["sentences"]) + for key in corenlp_coref_result["corefs"]: has_coref = False - for item in corenlp_coref_result['corefs'][key]: - if item['sentNum'] == last_index: + for item in corenlp_coref_result["corefs"][key]: + if item["sentNum"] == last_index: has_coref = True - text = item['text'] + text = item["text"] break if has_coref: q_coref[text] = [] - for item in corenlp_coref_result['corefs'][key]: - if item['sentNum'] == last_index: + for item in corenlp_coref_result["corefs"][key]: + if item["sentNum"] == last_index: continue - q_coref[text].append(item['text']) + q_coref[text].append(item["text"]) return q_coref def compute_corefs(self, conv_list): @@ -115,15 +117,18 @@ def compute_corefs(self, conv_list): """ conv_history = [] for msg in reversed(conv_list): - if msg.msg_info['msg_source'] == 'user' and msg.msg_info['msg_type'] in ['text', 'voice']: - temp = msg.text if msg.text.endswith('?') else (msg.text + '?') + if msg.msg_info["msg_source"] == "user" and msg.msg_info["msg_type"] in [ + "text", + "voice", + ]: + temp = msg.text if msg.text.endswith("?") else (msg.text + "?") conv_history.append(temp) # elif msg.msg_info['msg_source'] == 'system' and msg.msg_info['msg_type'] == 'text' and len(msg.text.split()) < 30: # temp = msg.text + '.' # conv_history.append(temp) if len(conv_history) == 0: - raise Exception('The query generation model cannot generate any query! There should be a problem') - coref_results = self.params['nlp_util'].get_coref(' '.join(conv_history)) + raise Exception( + "The query generation model cannot generate any query! There should be a problem" + ) + coref_results = self.params["nlp_util"].get_coref(" ".join(conv_history)) return coref_results - - diff --git a/macaw/core/retrieval/tantivy.py b/macaw/core/retrieval/tantivy.py new file mode 100644 index 0000000..1e23c39 --- /dev/null +++ b/macaw/core/retrieval/tantivy.py @@ -0,0 +1,52 @@ +""" +The Tantivy search engine. + +Authors: Arkin Dharawat (adharawat@umass.edu) +""" + +import os + +import tantivy + +from macaw.core.retrieval.doc import get_trec_doc +from macaw.core.retrieval.search_engine import Retrieval + + +class Tantivy(Retrieval): + def __init__(self, params): + super().__init__(params) + self.results_requested = ( + self.params["results_requested"] + if "results_requested" in self.params + else 1 + ) + + if not os.path.exists(params["path"]): + os.mkdir(params["path"]) + schema_builder = tantivy.SchemaBuilder() + schema_builder.add_text_field("body", stored=True) + schema_builder.add_unsigned_field("doc_id", stored=True) + schema = schema_builder.build() + self.index = tantivy.Index(schema, path=params["path"], reuse=params["load"]) + self.searcher = self.index.searcher() + self.logger = params["logger"] + self.logger.info("Loaded index and searcher") + + def retrieve(self, query): + docs = [] + try: + query = self.index.parse_query(query, ["body"]) + scores = self.searcher.search(query, self.results_requested).hits + # docs = [(self.searcher.doc(doc_id)['doc_id'], score) for score, doc_id in scores] + docs = [ + get_trec_doc(self.searcher.doc(doc_id)["body"][0]) + for _, doc_id in scores + ] # convert the raw text into trec-doc + except Exception as e: + self.logger.error(f"Error on Query {e}") + return None + + return docs + + def get_doc_from_index(self, doc_id): + return [get_trec_doc(self.searcher.doc(doc_id)["body"])] diff --git a/macaw/docker/flask_app/Dockerfile b/macaw/docker/flask_app/Dockerfile new file mode 100644 index 0000000..dce887d --- /dev/null +++ b/macaw/docker/flask_app/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.8.13 + +RUN apt-get update && apt-get install -y \ + nginx \ + supervisor \ + gcc \ + g++ \ + curl \ + python3-pip \ + vim + +# update pip +RUN pip3 install pip --upgrade + +# Setup flask application +RUN mkdir -p /deploy/app +# copy and install requirements before so that other changes do not require +# downloading requirements again. +COPY app/requirements.txt /deploy/app/requirements.txt +RUN pip3 install -r /deploy/app/requirements.txt +COPY app /deploy/app + +# Setup nginx +RUN rm /etc/nginx/sites-enabled/default +COPY config/flask.conf /etc/nginx/sites-available/ +RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf +RUN echo "daemon off;" >> /etc/nginx/nginx.conf + +RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn +# Setup supervisord +RUN mkdir -p /var/log/supervisor +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf + +EXPOSE 80 + +# Start processes +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/macaw/docker/flask_app/app/app.py b/macaw/docker/flask_app/app/app.py new file mode 100644 index 0000000..9adbaef --- /dev/null +++ b/macaw/docker/flask_app/app/app.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import time +import logging +import sys +from typing import Optional + +from flask import Flask, request, Response +from flask_restful import reqparse, Api, Resource + +import remote_module + +app = Flask("remote module") +api = Api(app) + +app.logger.addHandler(logging.StreamHandler(sys.stdout)) +app.logger.setLevel(logging.DEBUG) + + +class RemoteModule(Resource): + + def get(self): + return 200 + + def post(self): + t0 = time.time() + + args = request.get_json(force=True) + print(f"post request arguments {args}") + validation = self.__validate_input(args) + if validation: + return validation, 500 + + ret = {} + ret.update(self.__get_response(args)) + ret["performance"] = time.time() - t0 + ret["error"] = False + return ret, 200 + + @staticmethod + def __validate_input(args: dict) -> Optional[dict]: + message = "" + for ctx in remote_module.get_required_context(): + if not args.get(ctx): + message = "Context missing: " + ctx + if message: + return { + "message": message, + "error": True + } + return None + + @staticmethod + def __get_response(msg) -> dict: + response = remote_module.handle_message(msg) + app.logger.info("remote model result: %s", response) + return response + + +api.add_resource(RemoteModule, '/') + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8001) diff --git a/macaw/docker/flask_app/app/remote_module.py b/macaw/docker/flask_app/app/remote_module.py new file mode 100644 index 0000000..589f223 --- /dev/null +++ b/macaw/docker/flask_app/app/remote_module.py @@ -0,0 +1,15 @@ + +required_context = ["text"] + + +def get_required_context(): + return required_context + + +def handle_message(msg: dict) -> dict: + """ + Returns the response dictionary. Throws appropriate error if the model cannot generate a response. + """ + return { + "response": "okay from flask app." + } diff --git a/macaw/docker/flask_app/app/requirements.txt b/macaw/docker/flask_app/app/requirements.txt new file mode 100644 index 0000000..dea81b5 --- /dev/null +++ b/macaw/docker/flask_app/app/requirements.txt @@ -0,0 +1,11 @@ +Flask==2.1.2 +Flask-RESTful==0.3.9 +Werkzeug +future +jsonpickle +numpy +pandoc +six +typing +ConfigArgParse +gunicorn diff --git a/macaw/docker/flask_app/config/flask.conf b/macaw/docker/flask_app/config/flask.conf new file mode 100644 index 0000000..3b68b1c --- /dev/null +++ b/macaw/docker/flask_app/config/flask.conf @@ -0,0 +1,9 @@ +server { + listen 80; + + location / { + proxy_pass http://localhost:5001/; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} diff --git a/macaw/docker/flask_app/config/gunicorn.conf b/macaw/docker/flask_app/config/gunicorn.conf new file mode 100644 index 0000000..bfecb41 --- /dev/null +++ b/macaw/docker/flask_app/config/gunicorn.conf @@ -0,0 +1,3 @@ +[program:gunicorn] +command=/usr/bin/gunicorn app:app -b localhost:5001 +directory=/deploy/app diff --git a/macaw/docker/flask_app/config/supervisord.conf b/macaw/docker/flask_app/config/supervisord.conf new file mode 100644 index 0000000..a32f072 --- /dev/null +++ b/macaw/docker/flask_app/config/supervisord.conf @@ -0,0 +1,18 @@ +[supervisord] +nodaemon=true +user=root + +[program:nginx] +command=/usr/sbin/nginx +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:gunicorn] +command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info +directory=/deploy/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 diff --git a/macaw/docker/ic_app/Dockerfile b/macaw/docker/ic_app/Dockerfile new file mode 100644 index 0000000..dce887d --- /dev/null +++ b/macaw/docker/ic_app/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.8.13 + +RUN apt-get update && apt-get install -y \ + nginx \ + supervisor \ + gcc \ + g++ \ + curl \ + python3-pip \ + vim + +# update pip +RUN pip3 install pip --upgrade + +# Setup flask application +RUN mkdir -p /deploy/app +# copy and install requirements before so that other changes do not require +# downloading requirements again. +COPY app/requirements.txt /deploy/app/requirements.txt +RUN pip3 install -r /deploy/app/requirements.txt +COPY app /deploy/app + +# Setup nginx +RUN rm /etc/nginx/sites-enabled/default +COPY config/flask.conf /etc/nginx/sites-available/ +RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf +RUN echo "daemon off;" >> /etc/nginx/nginx.conf + +RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn +# Setup supervisord +RUN mkdir -p /var/log/supervisor +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf + +EXPOSE 80 + +# Start processes +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/macaw/docker/ic_app/app/app.py b/macaw/docker/ic_app/app/app.py new file mode 100644 index 0000000..9adbaef --- /dev/null +++ b/macaw/docker/ic_app/app/app.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import time +import logging +import sys +from typing import Optional + +from flask import Flask, request, Response +from flask_restful import reqparse, Api, Resource + +import remote_module + +app = Flask("remote module") +api = Api(app) + +app.logger.addHandler(logging.StreamHandler(sys.stdout)) +app.logger.setLevel(logging.DEBUG) + + +class RemoteModule(Resource): + + def get(self): + return 200 + + def post(self): + t0 = time.time() + + args = request.get_json(force=True) + print(f"post request arguments {args}") + validation = self.__validate_input(args) + if validation: + return validation, 500 + + ret = {} + ret.update(self.__get_response(args)) + ret["performance"] = time.time() - t0 + ret["error"] = False + return ret, 200 + + @staticmethod + def __validate_input(args: dict) -> Optional[dict]: + message = "" + for ctx in remote_module.get_required_context(): + if not args.get(ctx): + message = "Context missing: " + ctx + if message: + return { + "message": message, + "error": True + } + return None + + @staticmethod + def __get_response(msg) -> dict: + response = remote_module.handle_message(msg) + app.logger.info("remote model result: %s", response) + return response + + +api.add_resource(RemoteModule, '/') + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8001) diff --git a/macaw/docker/ic_app/app/remote_module.py b/macaw/docker/ic_app/app/remote_module.py new file mode 100644 index 0000000..bff776c --- /dev/null +++ b/macaw/docker/ic_app/app/remote_module.py @@ -0,0 +1,15 @@ + +required_context = ["text"] + + +def get_required_context(): + return required_context + + +def handle_message(msg: dict) -> dict: + """ + Returns the response dictionary. Throws appropriate error if the model cannot generate a response. + """ + return { + "response": "okay from ic model." + } diff --git a/macaw/docker/ic_app/app/requirements.txt b/macaw/docker/ic_app/app/requirements.txt new file mode 100644 index 0000000..dea81b5 --- /dev/null +++ b/macaw/docker/ic_app/app/requirements.txt @@ -0,0 +1,11 @@ +Flask==2.1.2 +Flask-RESTful==0.3.9 +Werkzeug +future +jsonpickle +numpy +pandoc +six +typing +ConfigArgParse +gunicorn diff --git a/macaw/docker/ic_app/config/flask.conf b/macaw/docker/ic_app/config/flask.conf new file mode 100644 index 0000000..3b68b1c --- /dev/null +++ b/macaw/docker/ic_app/config/flask.conf @@ -0,0 +1,9 @@ +server { + listen 80; + + location / { + proxy_pass http://localhost:5001/; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} diff --git a/macaw/docker/ic_app/config/gunicorn.conf b/macaw/docker/ic_app/config/gunicorn.conf new file mode 100644 index 0000000..bfecb41 --- /dev/null +++ b/macaw/docker/ic_app/config/gunicorn.conf @@ -0,0 +1,3 @@ +[program:gunicorn] +command=/usr/bin/gunicorn app:app -b localhost:5001 +directory=/deploy/app diff --git a/macaw/docker/ic_app/config/supervisord.conf b/macaw/docker/ic_app/config/supervisord.conf new file mode 100644 index 0000000..a32f072 --- /dev/null +++ b/macaw/docker/ic_app/config/supervisord.conf @@ -0,0 +1,18 @@ +[supervisord] +nodaemon=true +user=root + +[program:nginx] +command=/usr/sbin/nginx +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:gunicorn] +command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info +directory=/deploy/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 diff --git a/macaw/docker/qa_app/Dockerfile b/macaw/docker/qa_app/Dockerfile new file mode 100644 index 0000000..dce887d --- /dev/null +++ b/macaw/docker/qa_app/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.8.13 + +RUN apt-get update && apt-get install -y \ + nginx \ + supervisor \ + gcc \ + g++ \ + curl \ + python3-pip \ + vim + +# update pip +RUN pip3 install pip --upgrade + +# Setup flask application +RUN mkdir -p /deploy/app +# copy and install requirements before so that other changes do not require +# downloading requirements again. +COPY app/requirements.txt /deploy/app/requirements.txt +RUN pip3 install -r /deploy/app/requirements.txt +COPY app /deploy/app + +# Setup nginx +RUN rm /etc/nginx/sites-enabled/default +COPY config/flask.conf /etc/nginx/sites-available/ +RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf +RUN echo "daemon off;" >> /etc/nginx/nginx.conf + +RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn +# Setup supervisord +RUN mkdir -p /var/log/supervisor +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf + +EXPOSE 80 + +# Start processes +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/macaw/docker/qa_app/app/app.py b/macaw/docker/qa_app/app/app.py new file mode 100644 index 0000000..eb48d75 --- /dev/null +++ b/macaw/docker/qa_app/app/app.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import time +import logging +import sys +from typing import Optional + +from flask import Flask, request, Response +from flask_restful import reqparse, Api, Resource + +import remote_module + +app = Flask("remote module") +api = Api(app) + +app.logger.addHandler(logging.StreamHandler(sys.stdout)) +app.logger.setLevel(logging.DEBUG) + + +class RemoteModule(Resource): + + def get(self): + return 200 + + def post(self): + t0 = time.time() + + args = request.get_json(force=True) + print(f"post request arguments {args}") + validation = self.__validate_input(args) + if validation: + return validation, 500 + + ret = {} + ret.update(self.__get_response(args)) + ret["performance"] = time.time() - t0 + ret["error"] = False + return ret, 200 + + @staticmethod + def __validate_input(args: dict) -> Optional[dict]: + message = "" + for ctx in remote_module.get_required_context(): + if not args.get(ctx): + message = "Context missing: " + ctx + if message: + return { + "message": message, + "error": True + } + return None + + @staticmethod + def __get_response(msg) -> dict: + response = remote_module.handle_message(msg) + app.logger.info("remote model result: %s", response) + return response + + +api.add_resource(RemoteModule, '/') + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8003) diff --git a/macaw/docker/qa_app/app/remote_module.py b/macaw/docker/qa_app/app/remote_module.py new file mode 100644 index 0000000..af29ba8 --- /dev/null +++ b/macaw/docker/qa_app/app/remote_module.py @@ -0,0 +1,15 @@ + +required_context = ["text"] + + +def get_required_context(): + return required_context + + +def handle_message(msg: dict) -> dict: + """ + Returns the response dictionary. Throws appropriate error if the model cannot generate a response. + """ + return { + "response": "okay response from qa model." + } diff --git a/macaw/docker/qa_app/app/requirements.txt b/macaw/docker/qa_app/app/requirements.txt new file mode 100644 index 0000000..6ad266d --- /dev/null +++ b/macaw/docker/qa_app/app/requirements.txt @@ -0,0 +1,15 @@ +Flask==2.1.2 +Flask-RESTful==0.3.9 +Werkzeug +future +jsonpickle +numpy +pandoc +six +typing +ConfigArgParse +gunicorn +transformers==4.11.2 +itsdangerous==2.0.1 +torch +boto3 diff --git a/macaw/docker/qa_app/config/flask.conf b/macaw/docker/qa_app/config/flask.conf new file mode 100644 index 0000000..3b68b1c --- /dev/null +++ b/macaw/docker/qa_app/config/flask.conf @@ -0,0 +1,9 @@ +server { + listen 80; + + location / { + proxy_pass http://localhost:5001/; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} diff --git a/macaw/docker/qa_app/config/gunicorn.conf b/macaw/docker/qa_app/config/gunicorn.conf new file mode 100644 index 0000000..bfecb41 --- /dev/null +++ b/macaw/docker/qa_app/config/gunicorn.conf @@ -0,0 +1,3 @@ +[program:gunicorn] +command=/usr/bin/gunicorn app:app -b localhost:5001 +directory=/deploy/app diff --git a/macaw/docker/qa_app/config/supervisord.conf b/macaw/docker/qa_app/config/supervisord.conf new file mode 100644 index 0000000..a32f072 --- /dev/null +++ b/macaw/docker/qa_app/config/supervisord.conf @@ -0,0 +1,18 @@ +[supervisord] +nodaemon=true +user=root + +[program:nginx] +command=/usr/sbin/nginx +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:gunicorn] +command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info +directory=/deploy/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 diff --git a/macaw/interface/__init__.py b/macaw/interface/__init__.py index efebf6c..cc504ed 100644 --- a/macaw/interface/__init__.py +++ b/macaw/interface/__init__.py @@ -4,20 +4,20 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from macaw.interface import speech_recognition, telegram, stdio, fileio +from macaw.interface import fileio, speech_recognition, stdio, telegram def get_interface(params): - if 'asr_model' in params and params['asr_model'] == 'google': - params['asr'] = speech_recognition.GoogleASR(params) - if 'asg_model' in params and params['asg_model'] == 'google': - params['asg'] = speech_recognition.GoogleText2Speech(params) + if "asr_model" in params and params["asr_model"] == "google": + params["asr"] = speech_recognition.GoogleASR(params) + if "asg_model" in params and params["asg_model"] == "google": + params["asg"] = speech_recognition.GoogleText2Speech(params) - if params['interface'] == 'telegram': + if params["interface"] == "telegram": return telegram.TelegramBot(params) - elif params['interface'] == 'stdio': + elif params["interface"] == "stdio": return stdio.StdioInterface(params) - elif params['interface'] == 'fileio': + elif params["interface"] == "fileio": return fileio.FileioInterface(params) else: - raise Exception('The requested interface does not exist!') \ No newline at end of file + raise Exception("The requested interface does not exist!") diff --git a/macaw/interface/fileio.py b/macaw/interface/fileio.py index 467fb38..16e90dc 100644 --- a/macaw/interface/fileio.py +++ b/macaw/interface/fileio.py @@ -3,11 +3,12 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ - +import json import time +from datetime import datetime -from macaw.interface.interface import Interface from macaw.core.interaction_handler.msg import Message +from macaw.interface.interface import Interface class FileioInterface(Interface): @@ -16,46 +17,85 @@ def __init__(self, params): self.msg_id = int(time.time()) def run(self): - output_file = open(self.params['output_file_path'], 'w+') - with open(self.params['input_file_path']) as input_file: + with open(self.params["output_file_path"], "w+") as output_file_handler, \ + open(self.params["verbose_output_file_path"], "w+") as verbose_file_handler, \ + open(self.params["input_file_path"]) as input_file: for line in input_file: - str_list = line.strip().split('\t') - if len(str_list) < 2: - raise Exception('Each input line should contain at least 2 elements: a query ID and a query text.') + str_list = line.strip().split("\t") + if len(str_list) != 2: + raise Exception( + f"Each input line should contain at least 2 elements: a query ID and a query text." + f"Invalid line: {line}" + ) qid = str_list[0] - conv_list = [] - for i in range(1, len(str_list)): - user_info = {'first_name': 'NONE'} - msg_info = {'msg_id': qid, - 'msg_type': 'text', - 'msg_source': 'user'} - msg = Message(user_interface='NONE', - user_id=-1, - user_info=user_info, - msg_info=msg_info, - text=str_list[i], - timestamp=-1) - conv_list.append(msg) - conv_list.reverse() - output_msg = self.params['experimental_request_handler'](conv_list) - self.result_presentation(output_msg, {'output_file': output_file, 'qid': qid}) - output_file.close() - - def result_presentation(self, output_msg, params): - qid = params['qid'] - output_file = params['output_file'] - if self.params['output_format'] == 'trec': - if output_msg.msg_info['msg_type'] == 'options': - for (i, (option_name, option_id, output_score)) in enumerate(output_msg.msg_info['options']): - output_file.write(qid + '\tQ0\t' + option_name + '\t' + str(i+1) + '\t' + str(output_score) + '\tmacaw\n') + user_info = {"first_name": "NONE"} + msg_info = {"msg_id": qid, "msg_type": "text", "msg_source": "user"} + msg = Message( + user_interface="fileio", + user_id=-1, + user_info=user_info, + msg_info=msg_info, + text=str_list[1], + timestamp=datetime.utcnow(), + ) + output_msg = self.params["experimental_request_handler"](msg) + self.result_presentation( + output_msg, + { + "qid": qid, + "output_file_handler": output_file_handler, + "verbose_file_handler": verbose_file_handler + } + ) + + def result_presentation(self, response_msg: Message, additional_params: dict): + """ + This method writes the result of this turn into output files. Files are already opened before calling this + method and handlers passed in as arguments. + + response_msg: the result message generated for this turn. + additional_params: dict having params required by this method not present in the self.params class variable. + """ + qid = additional_params["qid"] + output_fh = additional_params["output_file_handler"] + verbose_fh = additional_params["verbose_file_handler"] + + verbose_fh.write(json.dumps(dict(response_msg))) + verbose_fh.write("\n") + + if self.params["output_format"] == "trec": + if response_msg.msg_info["msg_type"] == "options": + for (i, (option_name, option_id, output_score)) in enumerate( + response_msg.msg_info["options"] + ): + output_fh.write( + qid + + "\tQ0\t" + + option_name + + "\t" + + str(i + 1) + + "\t" + + str(output_score) + + "\tmacaw\n" + ) else: - raise Exception('TREC output format is only recognized for retrieval results. ' - 'Therefore, the message type should be options.') - elif self.params['output_format'] == 'text': - if output_msg.msg_info['msg_type'] == 'text': - output_file.write(qid + '\t' + output_msg.text.replace('\n', ' ').replace('\t', ' ') + '\n') + raise Exception( + "TREC output format is only recognized for retrieval results. " + "Therefore, the message type should be options." + ) + elif self.params["output_format"] == "text": + msg_type = response_msg.msg_info["msg_type"] + if msg_type == "text": + output_fh.write( + qid + + "\t" + + response_msg.response.replace("\n", " ").replace("\t", " ") + + "\n" + ) else: - raise Exception('text output format is only recognized for text outputs.') + raise Exception( + f"Text output format is only recognized for text outputs. Found {msg_type}." + ) else: - raise Exception('Unknown output file format!') + raise Exception("Unknown output file format!") diff --git a/macaw/interface/interface.py b/macaw/interface/interface.py index 63312ab..e2638be 100644 --- a/macaw/interface/interface.py +++ b/macaw/interface/interface.py @@ -6,6 +6,8 @@ from abc import ABC, abstractmethod +from core.interaction_handler import Message + class Interface(ABC): def __init__(self, params): @@ -16,5 +18,5 @@ def run(self): pass @abstractmethod - def result_presentation(self, response_msg, params): - pass \ No newline at end of file + def result_presentation(self, response_msg: Message, additional_params: dict): + pass diff --git a/macaw/interface/speech_recognition.py b/macaw/interface/speech_recognition.py index 52a3ff1..467cfbc 100644 --- a/macaw/interface/speech_recognition.py +++ b/macaw/interface/speech_recognition.py @@ -4,30 +4,32 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from abc import ABC, abstractmethod import os import tempfile +from abc import ABC, abstractmethod import speech_recognition as sr from google.cloud import texttospeech from pydub import AudioSegment -def mp3_to_ogg(input_file_name): # caller should delete the file afterwards. +def mp3_to_ogg(input_file_name): # caller should delete the file afterwards. ogg_file = tempfile.NamedTemporaryFile(delete=False) - AudioSegment.from_mp3(input_file_name).export(ogg_file.name, format='ogg', parameters=["-acodec", "libopus"]) + AudioSegment.from_mp3(input_file_name).export( + ogg_file.name, format="ogg", parameters=["-acodec", "libopus"] + ) ogg_file.close() return ogg_file.name -def ogg_to_wav(input_file_name): # caller should delete the file afterwards. +def ogg_to_wav(input_file_name): # caller should delete the file afterwards. wav_file = tempfile.NamedTemporaryFile(delete=False) - AudioSegment.from_ogg(input_file_name).export(wav_file.name, format='wav') + AudioSegment.from_ogg(input_file_name).export(wav_file.name, format="wav") wav_file.close() return wav_file.name -class ASR(ABC): # Automatic Speech Recognition +class ASR(ABC): # Automatic Speech Recognition def __init__(self, params): self.params = params @@ -36,7 +38,7 @@ def speech_to_text(self, file_path): pass -class ASG(ABC): # Automatic Speech Generation +class ASG(ABC): # Automatic Speech Generation def __init__(self, params): self.params = params @@ -62,23 +64,31 @@ def speech_to_text(self, file_path): except sr.UnknownValueError: print("Google Speech Recognition could not understand audio") except sr.RequestError as e: - print("Could not request results from Google Speech Recognition service; {0}".format(e)) + print( + "Could not request results from Google Speech Recognition service; {0}".format( + e + ) + ) class GoogleText2Speech(ASG): def __init__(self, params): super().__init__(params) - os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.params['google-speech-to-text-credential-file'] + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.params[ + "google-speech-to-text-credential-file" + ] # Instantiates a client self.client = texttospeech.TextToSpeechClient() # Build the voice request, select the language code ("en-US") and the ssml # voice gender ("neutral") self.voice = texttospeech.types.VoiceSelectionParams( - language_code='en-US', - ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL) + language_code="en-US", + ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL, + ) # Select the type of audio file you want returned self.audio_config = texttospeech.types.AudioConfig( - audio_encoding=texttospeech.enums.AudioEncoding.MP3) + audio_encoding=texttospeech.enums.AudioEncoding.MP3 + ) def text_to_speech(self, text): # Set the text input to be synthesized @@ -86,11 +96,12 @@ def text_to_speech(self, text): # Perform the text-to-speech request on the text input with the selected # voice parameters and audio file type - response = self.client.synthesize_speech(synthesis_input, self.voice, self.audio_config) + response = self.client.synthesize_speech( + synthesis_input, self.voice, self.audio_config + ) mp3_file = tempfile.NamedTemporaryFile(delete=True) mp3_file.write(response.audio_content) ogg_file_name = mp3_to_ogg(mp3_file.name) mp3_file.close() return ogg_file_name - diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py index aca7aa0..149dfb2 100644 --- a/macaw/interface/stdio.py +++ b/macaw/interface/stdio.py @@ -1,15 +1,15 @@ """ The STDIO interface for interactive CIS. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ - +import json import time import traceback +from datetime import datetime -from macaw import util -from macaw.interface.interface import Interface from macaw.core.interaction_handler.msg import Message +from macaw.interface.interface import Interface class StdioInterface(Interface): @@ -18,44 +18,60 @@ def __init__(self, params): self.msg_id = int(time.time()) def run(self): - while True: - try: - request = input('ENTER YOUR COMMAND: ').strip() - if len(request) == 0: - continue - user_info = {'first_name': 'STDIO', - 'is_bot': 'False' - } - msg_info = {'msg_id': self.msg_id, - 'msg_type': 'command' if request.startswith('#') else 'text', - 'msg_source': 'user'} - self.msg_id += 1 - msg = Message(user_interface='stdio', - user_id=-1, - user_info=user_info, - msg_info=msg_info, - text=request, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {}) - except Exception as ex: - traceback.print_exc() + print(f"Inside StdioInterface run method.") + with open(self.params["verbose_output_file_path"], "w+") as verbose_file: + while True: + try: + request = input("ENTER YOUR COMMAND (type 'exit' to leave): ").strip() + if request == "exit": + break + if len(request) == 0: + continue + user_info = {"first_name": "STDIO", "is_bot": "False"} + msg_info = { + "msg_id": self.msg_id, + "msg_type": "command" if request.startswith("#") else "text", + "msg_source": "user", + } + self.msg_id += 1 + msg = Message( + user_interface="stdio", + user_id=-1, + text=request, + timestamp=datetime.utcnow(), + user_info=user_info, + msg_info=msg_info, + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"verbose_file_handler": verbose_file}) + except Exception as ex: + traceback.print_exc() + + def result_presentation(self, response_msg: Message, additional_params: dict): + verbose_fh = additional_params["verbose_file_handler"] + verbose_fh.write(json.dumps(dict(response_msg)) + "\n") - def result_presentation(self, response_msg, params): try: - print('THE RESPONSE STARTS') - print('----------------------------------------------------------------------') - if response_msg.msg_info['msg_type'] == 'text': - print(response_msg.text) - elif response_msg.msg_info['msg_type'] == 'options': - for (option_text, option_data, output_score) in response_msg.msg_info['options']: - print(option_data, ' | ', option_text) - elif response_msg.msg_info['msg_type'] == 'error': - print('ERROR: NO RESULT!') + print("THE RESPONSE STARTS") + print( + "----------------------------------------------------------------------" + ) + if response_msg.msg_info["msg_type"] == "text": + print(response_msg.response) + elif response_msg.msg_info["msg_type"] == "options": + for (option_text, option_data, output_score) in response_msg.msg_info[ + "options" + ]: + print(option_data, " | ", option_text) + elif response_msg.msg_info["msg_type"] == "error": + print("ERROR: NO RESULT!") else: - raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type']) - print('----------------------------------------------------------------------') - print('THE RESPONSE STARTS') + raise Exception( + "The msg_type is not recognized:", response_msg.msg_info["msg_type"] + ) + print( + "----------------------------------------------------------------------" + ) + print("THE RESPONSE ENDS") except Exception as ex: traceback.print_exc() - diff --git a/macaw/interface/telegram.py b/macaw/interface/telegram.py index b6847a8..d0951c8 100644 --- a/macaw/interface/telegram.py +++ b/macaw/interface/telegram.py @@ -1,18 +1,24 @@ """ The Telegram bot (supports interactive multi-modal interactions with different devices). -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ -import urllib.parse import os import tempfile import traceback +import urllib.parse +from datetime import datetime from telegram import InlineKeyboardButton, InlineKeyboardMarkup -from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler +from telegram.ext import ( + CallbackQueryHandler, + CommandHandler, + Filters, + MessageHandler, + Updater, +) -from macaw import util from macaw.core.interaction_handler.msg import Message from macaw.interface.interface import Interface @@ -26,21 +32,25 @@ def __init__(self, params): params(dict): A dict of parameters. The params 'logger' and 'bot_token' are mandatory. """ super().__init__(params) - self.logger = self.params['logger'] + self.logger = self.params["logger"] - self.MAX_MSG_LEN = 1000 # maximum number of characters in each response message. - self.MAX_OPTION_LEN = 30 # maximum number of characters in each clickable option text. + self.MAX_MSG_LEN = ( + 1000 # maximum number of characters in each response message. + ) + self.MAX_OPTION_LEN = ( + 30 # maximum number of characters in each clickable option text. + ) # Starting the bot by creating the Updater. # Make sure to set use_context=True to use the new context based callbacks # If you don't have a bot_token, add 'botfather' to your personal Telegram account and follow the instructions # to get a token for your bot. - self.updater = Updater(self.params['bot_token'], use_context=True) + self.updater = Updater(self.params["bot_token"], use_context=True) self.dp = self.updater.dispatcher # Telegram command handlers (e.g., /start) - self.dp.add_handler(CommandHandler('start', self.start)) - self.dp.add_handler(CommandHandler('help', self.help)) + self.dp.add_handler(CommandHandler("start", self.start)) + self.dp.add_handler(CommandHandler("help", self.help)) # Telegram message handlers self.dp.add_handler(MessageHandler(Filters.text, self.request_handler)) @@ -52,32 +62,41 @@ def __init__(self, params): def start(self, update, context): """Send a message when the command /start is issued.""" - update.message.reply_text('Hi, welcome to Macaw! Macaw is an open-source extensible framework for ' - 'conversational information seeking. Visit: https://github.com/microsoft/macaw') + update.message.reply_text( + "Hi, welcome to Macaw! Macaw is an open-source extensible framework for " + "conversational information seeking. Visit: https://github.com/microsoft/macaw" + ) def help(self, update, context): """Send a message when the command /help is issued.""" - update.message.reply_text('Macaw should be able to answer your questions. Just ask a question!') + update.message.reply_text( + "Macaw should be able to answer your questions. Just ask a question!" + ) def request_handler(self, update, context): """This method handles all text messages, and asks result_presentation to send the response to the user.""" try: self.logger.info(update.message) - user_info = {'first_name': update.message.chat.first_name, - 'last_name': update.message.chat.last_name, - 'is_bot': update._effective_user.is_bot + user_info = { + "first_name": update.message.chat.first_name, + "last_name": update.message.chat.last_name, + "is_bot": update._effective_user.is_bot, + } + msg_info = { + "msg_id": update.message.message_id, + "msg_type": "text", + "msg_source": "user", } - msg_info = {'msg_id': update.message.message_id, - 'msg_type': 'text', - 'msg_source': 'user'} - msg = Message(user_interface='telegram', - user_id=update.message.chat.id, - user_info=user_info, - msg_info=msg_info, - text=update.message.text, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {'update': update}) + msg = Message( + user_interface="telegram", + user_id=update.message.chat.id, + user_info=user_info, + msg_info=msg_info, + text=update.message.text, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"update": update}) except Exception: traceback.print_exc() @@ -86,25 +105,30 @@ def voice_request_handler(self, update, context): try: ogg_file = tempfile.NamedTemporaryFile(delete=True) update.message.voice.get_file().download(ogg_file.name) - text = self.params['asr'].speech_to_text(ogg_file.name) + text = self.params["asr"].speech_to_text(ogg_file.name) ogg_file.close() - update.message.reply_text('Macaw heard: ' + text) - - user_info = {'first_name': update.message.chat.first_name, - 'last_name': update.message.chat.last_name, - 'is_bot': update._effective_user.is_bot - } - msg_info = {'msg_id': update.message.message_id, - 'msg_type': 'voice', - 'msg_source': 'user'} - msg = Message(user_interface='telegram', - user_id=update.message.chat.id, - user_info=user_info, - msg_info=msg_info, - text=text, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {'update': update}) + update.message.reply_text("Macaw heard: " + text) + + user_info = { + "first_name": update.message.chat.first_name, + "last_name": update.message.chat.last_name, + "is_bot": update._effective_user.is_bot, + } + msg_info = { + "msg_id": update.message.message_id, + "msg_type": "voice", + "msg_source": "user", + } + msg = Message( + user_interface="telegram", + user_id=update.message.chat.id, + user_info=user_info, + msg_info=msg_info, + text=text, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"update": update}) except Exception: traceback.print_exc() @@ -112,53 +136,78 @@ def button_click_handler(self, update, context): """This method handles clicks, and asks result_presentation to send the response to the user.""" try: self.logger.info(update) - user_info = {'first_name': update.callback_query.message.chat.first_name, - 'last_name': update.callback_query.message.chat.last_name, - 'is_bot': update._effective_user.is_bot - } - msg_info = {'msg_id': update.callback_query.message.message_id, - 'msg_type': 'command', - 'msg_source': 'user'} - msg = Message(user_interface='telegram', - user_id=update.callback_query.message.chat.id, - user_info=user_info, - msg_info=msg_info, - text=update.callback_query.data, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {'update': update}) + user_info = { + "first_name": update.callback_query.message.chat.first_name, + "last_name": update.callback_query.message.chat.last_name, + "is_bot": update._effective_user.is_bot, + } + msg_info = { + "msg_id": update.callback_query.message.message_id, + "msg_type": "command", + "msg_source": "user", + } + msg = Message( + user_interface="telegram", + user_id=update.callback_query.message.chat.id, + user_info=user_info, + msg_info=msg_info, + text=update.callback_query.data, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"update": update}) except Exception as ex: traceback.print_exc() - def result_presentation(self, response_msg, params): + def result_presentation(self, response_msg, additional_params): """This method produces an appropriate response to be sent to the client.""" try: if response_msg is None: return - update = params['update'] - if response_msg.msg_info['msg_type'] == 'text': + update = additional_params["update"] + if response_msg.msg_info["msg_type"] == "text": if update.message is not None: - update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN]) + update.message.reply_text(response_msg.text[: self.MAX_MSG_LEN]) elif update.callback_query.message is not None: - update.callback_query.message.reply_text(response_msg.text[:self.MAX_MSG_LEN]) - elif response_msg.msg_info['msg_type'] == 'voice': - ogg_file_name = self.params['asg'].text_to_speech(response_msg.text[:self.MAX_MSG_LEN]) - self.updater.bot.send_voice(chat_id=update.message.chat.id, voice=open(ogg_file_name, 'rb')) + update.callback_query.message.reply_text( + response_msg.text[: self.MAX_MSG_LEN] + ) + elif response_msg.msg_info["msg_type"] == "voice": + ogg_file_name = self.params["asg"].text_to_speech( + response_msg.text[: self.MAX_MSG_LEN] + ) + self.updater.bot.send_voice( + chat_id=update.message.chat.id, voice=open(ogg_file_name, "rb") + ) os.remove(ogg_file_name) # removing audio files for privacy reasons. - elif response_msg.msg_info['msg_type'] == 'options': - keyboard = [[InlineKeyboardButton(option_text[:self.MAX_OPTION_LEN], - callback_data=urllib.parse.unquote(option_data))] - for (option_text, option_data, output_score) in response_msg.msg_info['options']] + elif response_msg.msg_info["msg_type"] == "options": + keyboard = [ + [ + InlineKeyboardButton( + option_text[: self.MAX_OPTION_LEN], + callback_data=urllib.parse.unquote(option_data), + ) + ] + for ( + option_text, + option_data, + output_score, + ) in response_msg.msg_info["options"] + ] reply_markup = InlineKeyboardMarkup(keyboard) - update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN], reply_markup=reply_markup) - elif response_msg.msg_info['msg_type'] == 'error': - error_msg = 'ERROR: NO RESULT!' + update.message.reply_text( + response_msg.text[: self.MAX_MSG_LEN], reply_markup=reply_markup + ) + elif response_msg.msg_info["msg_type"] == "error": + error_msg = "ERROR: NO RESULT!" if update.message is not None: update.message.reply_text(error_msg) elif update.callback_query.message is not None: update.callback_query.message.reply_text(error_msg) else: - raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type']) + raise Exception( + "The msg_type is not recognized:", response_msg.msg_info["msg_type"] + ) except Exception: traceback.print_exc() @@ -173,10 +222,9 @@ def send_msg(self, chat_id, msg_text): def run(self): """Starting the bot!""" - self.logger.info('Running the Telegram bot!') + self.logger.info("Running the Telegram bot!") self.updater.start_polling() # Run the bot until you press Ctrl-C or the process receives SIGINT, # SIGTERM or SIGABRT. This should be used most of the time, since # start_polling() is non-blocking and will stop the bot gracefully. self.updater.idle() - diff --git a/macaw/live_main.py b/macaw/live_main.py index bf1ec30..778a863 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -3,12 +3,14 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ +import argparse +from typing import List +from core.interaction_handler import Message +from core.response.punctuation import ResponseGeneratorPunctuation from macaw.cis import CIS from macaw.core import mrc, retrieval -from macaw.core.input_handler.action_detection import RequestDispatcher -from macaw.core.output_handler import naive_output_selection -from macaw.util.logging import Logger +from macaw.util.custom_logging import LoggerFactory class ConvQA(CIS): @@ -24,66 +26,113 @@ def __init__(self, params): for more information on the required parameters. """ super().__init__(params) - self.logger = params['logger'] - self.logger.info('Conversational QA Model... starting up...') - self.retrieval = retrieval.get_retrieval_model(params=self.params) - self.qa = mrc.get_mrc_model(params=self.params) - self.params['actions'] = {'retrieval': self.retrieval, 'qa': self.qa} - self.request_dispatcher = RequestDispatcher(self.params) - self.output_selection = naive_output_selection.NaiveOutputProcessing({}) - - def request_handler_func(self, conv_list): + self.logger.info("Conversational QA Model... starting up...") + + def generate_actions(self) -> dict: + return { + "retrieval": retrieval.get_retrieval_model(params=self.params), + "qa": mrc.get_mrc_model(params=self.params) + } + + def request_handler_func(self, conv_list: List[Message]) -> Message: """ - This function is called for each conversational interaction made by the user. In fact, this function calls the - dispatcher to send the user request to the information seeking components. + This function is called for each conversational interaction made by the user. It calls the NLP pipeline, DST + model and all the actions and saves their result in the latest conversation message. Args: - conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the + conv_list(list): List of Message, each corresponding to a conversational message from / to the user. This list is in reverse order, meaning that the first elements is the last interaction made by user. Returns: output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user. + It is the latest conversation message. """ self.logger.info(conv_list) + + self.nlp_pipeline.run(conv_list) + self.logger.info(f"nlp pipeline result: {conv_list[0].nlp_pipeline_result}") + + # Run the DST module and save the output in conversation. + nlp_pipeline_output = conv_list[0].nlp_pipeline_result + self.dialogue_manager.process_turn(nlp_pipeline_output) + conv_list[0].dialog_manager = self.dialogue_manager + dispatcher_output = self.request_dispatcher.dispatch(conv_list) + output_msg = self.output_selection.get_output(conv_list, dispatcher_output) - return output_msg + + # Run response generator. + models_to_run = self.response_generator_handler.models_selector(conv_list) + rg_output = self.response_generator_handler.run_models(models_to_run, conv_list) + self.logger.info(f"rg_output: {rg_output}") + + # TODO: Select or create response from RG output. + + conv_list[0].msg_info = output_msg.msg_info + conv_list[0].response = output_msg.response + conv_list[0].timestamp = output_msg.timestamp + conv_list[0].actions_result = rg_output + + return conv_list[0] def run(self): """ - This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. + This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. """ self.interface.run() -if __name__ == '__main__': - basic_params = {'timeout': 15, # timeout is in terms of second. - 'mode': 'live', # mode can be either live or exp. - 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class. +if __name__ == "__main__": + # Parse input arguments. + parser = argparse.ArgumentParser(description="Run live_main.py file") + parser.add_argument("--mode", type=str, default="live", help="live or exp (experimental)") + parser.add_argument("--interface", type=str, default="stdio", help="can be 'telegram' or 'stdio' for live mode, and" + " 'fileio' for exp mode") + args = parser.parse_args() + + basic_params = { + "timeout": 15, # timeout is in terms of second. + "mode": args.mode, # mode can be either live or exp. + } + + # for logging into file, pass the filepath to the Logger class. + my_logger = LoggerFactory.create_logger({}) # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the # database, as well as the database name. - db_params = {'interaction_db_host': 'localhost', - 'interaction_db_port': 27017, - 'interaction_db_name': 'macaw_test'} + db_params = { + "interaction_db_host": "localhost", + "interaction_db_port": 27017, + "interaction_db_name": "macaw_test", + } # These are interface parameters. They are interface specific. - interface_params = {'interface': 'telegram', # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' - # for exp mode. - 'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN', # Telegram bot token. - 'asr_model': 'google', # The API used for speech recognition. - 'asg_model': 'google', # The API used for speech generation. - 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE'} + interface_params = { + "interface": args.interface, # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' + + # for experimental mode with fileio interface. + "input_file_path": "/usr/src/app/data/file_input.txt", + "output_file_path": "/usr/src/app/data/file_output.txt", + + "verbose_output_file_path": "/usr/src/app/data/file_output_verbose.txt", + "output_format": "text", + "bot_token": "YOUR_TELEGRAM_BOT_TOKEN", # Telegram bot token. + # 'asr_model': 'google', # The API used for speech recognition. + # 'asg_model': 'google', # The API used for speech generation. + "google-speech-to-text-credential-file": "YOUR_GOOGLE_CREDENTIAL_FILE", + } # These are parameters used by the retrieval model. - retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history. - 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False. - 'search_engine': 'bing', # the search engine. It can be either 'indri' or 'bing'. - 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key - 'search_engine_path': 'PATH_TO_INDRI', # The path to the indri toolkit. - 'col_index': 'PATH_TO_INDRI_INDEX', # The path to the indri index. - 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported. - 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine. + retrieval_params = { + "query_generation": "simple", # the model that generates a query from a conversation history. + "use_coref": True, # True, if query generator can use coreference resolution, otherwise False. + "search_engine": "tantivy", # the search engine. + "bing_key": "YOUR_BING_SUBSCRIPTION_KEY", # Bing API key + "search_engine_path": "tantivy_index/", # The path to the tantivy index. + "col_index": "/usr/src/app/indri-5.11/buildindex/my_index", # The path to the indri index. + "col_text_format": "trectext", # collection text format. Standard 'trectext' is only supported. + "results_requested": 3, + } # Maximum number of docs that should be retrieved by search engine. # Note: If you want to have a re-ranking model (e.g., learning to rank), you just need to simply extend the class # core.retrieval.search_engine.ReRanker and implement the method 'rerank'. Then simply add a 'reranker' parameter to # retrieval_params that points to an instance of your favorite ReRanker class. If there is a 'reranker' parameter in @@ -91,13 +140,34 @@ def run(self): # 'get_results' in class core.retrieval.search_engine.Retrieval. # These are parameters used by the machine reading comprehension model. - mrc_params = {'mrc': 'drqa', # MRC model. - 'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL', # The path to the model parameters. - 'mrc_path': 'PATH_TO_MRC_DIRECTORY', # The path to the model toolkit. - 'corenlp_path': 'PATH_TO_STANFORD_CORE_NLP_DIRECTORY', # The path to the corenlp toolkit. - 'qa_results_requested': 3} # The number of candidate answers returned by the MRC model. - - params = {**basic_params, **db_params, **interface_params, **retrieval_params, **mrc_params} - basic_params['logger'].info(params) + mrc_params = { + "mrc": "drqa", # MRC model. + "mrc_model_path": "/usr/src/app/DrQA/data/reader/multitask.mdl", # The path to the model parameters. + "mrc_path": "/usr/src/app/DrQA", # The path to the model toolkit. + "corenlp_path": "/usr/src/app/stanford-corenlp-full-2017-06-09", # The path to the corenlp toolkit. + "qa_results_requested": 3, + } # The number of candidate answers returned by the MRC model. + + # ML modules which are part of the NLP pipeline and all response generators. + ml_modules = { + "nlp_models": { + "intent_classification": "http://nlp-pipeline-app-ic:80", + "sample_flask": "http://nlp-pipeline-app-flask:80", + }, + "response_generator_models": { + "qa": "http://response-generator-app-qa:80", + "punctuation": ResponseGeneratorPunctuation(name="punctuation_model") + } + } + + params = { + **basic_params, + **db_params, + **interface_params, + **retrieval_params, + **mrc_params, + **ml_modules + } + + my_logger.info(params) ConvQA(params).run() - diff --git a/macaw/util/__init__.py b/macaw/util/__init__.py index 5e6e731..07aaaf4 100644 --- a/macaw/util/__init__.py +++ b/macaw/util/__init__.py @@ -5,22 +5,15 @@ """ import json -import time +import backoff as backoff from stanfordcorenlp import StanfordCoreNLP -def current_time_in_milliseconds(): - """ - A method that returns the current time in milliseconds. - - Returns: - An int representing the current time in milliseconds. - """ - return int(round(time.time() * 1000)) - - class NLPUtil: + @backoff.on_exception(backoff.expo, + Exception, + max_tries=3) def __init__(self, params): """ A simple NLP helper class. @@ -29,11 +22,15 @@ def __init__(self, params): params(dict): A dict containing some parameters. """ self.params = params - self.corenlp = StanfordCoreNLP(self.params['corenlp_path'], quiet=False) + self.corenlp = StanfordCoreNLP(self.params["corenlp_path"], quiet=False) # Pre-fetching the required models. - props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False} - self.corenlp.annotate('', properties=props) + props = { + "annotators": "coref", + "pipelineLanguage": "en", + "ner.useSUTime": False, + } + self.corenlp.annotate("", properties=props) def get_coref(self, text): """ @@ -44,7 +41,11 @@ def get_coref(self, text): Returns: A json object containing all co-reference resolutions extracted from the input text. """ - props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False} + props = { + "annotators": "coref", + "pipelineLanguage": "en", + "ner.useSUTime": False, + } result = json.loads(self.corenlp.annotate(text, properties=props)) return result diff --git a/macaw/util/custom_logging.py b/macaw/util/custom_logging.py new file mode 100644 index 0000000..7ac0da7 --- /dev/null +++ b/macaw/util/custom_logging.py @@ -0,0 +1,28 @@ +""" +The internal logger. + +Authors: Hamed Zamani (hazamani@microsoft.com) +""" + +import logging + + +class LoggerFactory: + @staticmethod + def create_logger(params) -> logging.Logger: + """ + Creates a new custom logger with configuration passed in params dictionary. Supported options are + 'logger_name', 'logger_level', and 'logger_file'. + """ + logger = logging.getLogger(params.get("logger_name", "MacawLogger")) + logger.setLevel(params.get("logger_level", logging.DEBUG)) + + if "logger_file" in params: + handler = logging.FileHandler(params["logger_file"]) + else: + handler = logging.StreamHandler() + + formatter = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger diff --git a/macaw/util/logging.py b/macaw/util/logging.py deleted file mode 100644 index 156ae77..0000000 --- a/macaw/util/logging.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -The internal logger. - -Authors: Hamed Zamani (hazamani@microsoft.com) -""" - -import logging - - -class Logger(logging.Logger): - def __init__(self, params): - """ - A simple logging class, inherited from the standard logging.Logger. - - Args: - params(dict): A dict containing some parameters. 'logging_file' is an optional parameter, otherwise STDIO - will be used for logging. - """ - super().__init__('Macaw Logger') - self.params = params - if 'logging_file' in params: - self.handler_ = logging.FileHandler(params['logging_file']) - else: - self.handler_ = logging.StreamHandler() - - self.format = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s') - self.handler_.setFormatter(self.format) - self.addHandler(self.handler_) diff --git a/macaw/util/text_parser.py b/macaw/util/text_parser.py index 8493bc3..5ae3205 100644 --- a/macaw/util/text_parser.py +++ b/macaw/util/text_parser.py @@ -4,9 +4,10 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -import justext from xml.etree import cElementTree as ElementTree +import justext + class XmlListConfig(list): def __init__(self, aList): @@ -88,6 +89,7 @@ def xml_file_to_dict(xml_file): # result = filter(visible, data) # return ' '.join(result) + def html_to_clean_text(html): """ Converting an HTML document to clean text. @@ -102,4 +104,4 @@ def html_to_clean_text(html): for paragraph in paragraphs: if not paragraph.is_boilerplate: clean_text_list.append(paragraph.text) - return '\n'.join(clean_text_list) + return "\n".join(clean_text_list) diff --git a/macaw/wizard_of_oz_main.py b/macaw/wizard_of_oz_main.py index 08331d5..7324bfb 100644 --- a/macaw/wizard_of_oz_main.py +++ b/macaw/wizard_of_oz_main.py @@ -3,15 +3,15 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ - +import logging import multiprocessing from macaw import interface from macaw.core import retrieval -from macaw.core.input_handler.action_detection import RequestDispatcher +from macaw.core.response.action_detection import RequestDispatcher from macaw.core.interaction_handler.user_requests_db import InteractionDB from macaw.core.output_handler import naive_output_selection -from macaw.util.logging import Logger +from macaw.util.custom_logging import LoggerFactory class Seeker: @@ -27,7 +27,7 @@ def __init__(self, params): for more information on the required parameters. """ self.params = params - self.logger = params['logger'] + self.logger = logging.getLogger("MacawLogger") self.logger.info('Conversational Wirzard of Oz System... starting up...') self.wizard = None self.params['live_request_handler'] = self.live_request_handler @@ -87,7 +87,7 @@ def __init__(self, params): for more information on the required parameters. """ self.params = params - self.logger = params['logger'] + self.logger = logging.getLogger("MacawLogger") self.logger.info('Conversational Wirzard of Oz System... starting up...') self.params['live_request_handler'] = self.live_request_handler self.seeker = None @@ -148,9 +148,9 @@ def run(self): if __name__ == '__main__': + my_logger = LoggerFactory.create_logger({}) basic_params = {'timeout': 15, # timeout is in terms of second. - 'mode': 'live', # mode can be either live or exp. - 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class. + 'mode': 'live'} # mode can be either live or exp. # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the # database, as well as the database name. @@ -190,8 +190,8 @@ def run(self): seeker_params = {**basic_params, **db_params, **seeker_interface_params, **retrieval_params} wizard_params = {**basic_params, **db_params, **wizard_interface_params, **retrieval_params} - basic_params['logger'].info(seeker_params) - basic_params['logger'].info(wizard_params) + my_logger.info(seeker_params) + my_logger.info(wizard_params) seeker = Seeker(seeker_params) wizard = Wizard(wizard_params) @@ -204,6 +204,6 @@ def run(self): seeker_process.start() wizard_process.start() - basic_params['logger'].info('Seeker Process ID: {}'.format(seeker_process.pid)) - basic_params['logger'].info('Wizard Process ID: {}'.format(wizard_process.pid)) + my_logger.info('Seeker Process ID: {}'.format(seeker_process.pid)) + my_logger.info('Wizard Process ID: {}'.format(wizard_process.pid)) diff --git a/requirements.txt b/requirements.txt index 4a068f5..856df75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,7 @@ pydub python-telegram-bot==12.0.0 stanfordcorenlp google-cloud-texttospeech +tantivy +pymongo +backoff +deepmultilingualpunctuation diff --git a/schema.ts b/schema.ts new file mode 100644 index 0000000..0e1c16f --- /dev/null +++ b/schema.ts @@ -0,0 +1,21 @@ +interface Message { + user_interface: string, + user_id: string | number, // number as in integer + text: string, + timestamp: Date, // datetime.utcnow() + user_info?: {[user_detail: string]: any}, + msg_info?: {[msg_detail: string]: any}, + actions: { + [action: string]: string | {[action_var: string]: any} + }, + dialog_state_tracking: { + state: string, + [dialog_state_tracking_var: string]: string + }, + nlp_pipeline: { + [module: string]: {[module_var: string]: any} + } + user_attributes: { + [attribute: string]: any + } +} \ No newline at end of file diff --git a/scripts/start.sh b/scripts/start.sh new file mode 100644 index 0000000..a9f61dd --- /dev/null +++ b/scripts/start.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +echo "Inside the execution script." + +# Start MongoDB as a daemon process https://docs.mongodb.com/manual/tutorial/manage-mongodb-processes/ +mongod --fork --logpath /var/log/mongodb/mongod.log +echo "MongoDB started successfully." + +echo "This is the python version: $(python3 --version)" + +# Start the main application. +python3 macaw/live_main.py --mode exp --interface fileio +# Use the below command for terminal IO. +#python3 macaw/live_main.py --mode live --interface stdio diff --git a/setup.py b/setup.py index 8675bdc..c9a82ac 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='macaw', version='0.1', - packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.input_handler', + packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.response', 'macaw.core.output_handler', 'macaw.core.interaction_handler', 'macaw.util', 'macaw.interface'], url='https://github.com/microsoft/macaw/', license='MIT', diff --git a/trec_documents/.DS_Store b/trec_documents/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/trec_documents/.DS_Store differ diff --git a/trec_documents/docset.trectext b/trec_documents/docset.trectext new file mode 100644 index 0000000..67ba521 --- /dev/null +++ b/trec_documents/docset.trectext @@ -0,0 +1,94 @@ + + XIE19960101.0001 + 1996-01-01 20:06 + + Yearender: Jordanian Diplomacy Witnesses Great Achievements in 1995 (part two) + +The normalization with Egypt, an active supporter of the +anti-Iraq coalition during the Gulf crisis, was symbolized by the +visit of Egyptian President Hosni Mubarak to Jordan last January. +At the same time, Jordan improved its ties with the +Palestinians by dispelling their fears that arouse among the +Palestinians due to Jordan's special role in Jerusalem under the +Jordanian-Israeli accord. +In addition, Jordan also made efforts to push forward the +Palestinian-Israeli negotiations on expanding Palestinian autonomy +in the West Bank. +To its own benefits, Jordan's peace treaty with Israel and new +attitude toward Baghdad have also brought about warmer ties with +the Western countries, with the United States in particular. +Among others, U.S. Vice President Al Gore, British Prime +Minister John Major, German Chancellor Helmut Kohl, Japanese Prime +Minister Tomiichi Murayama as well as Spanish Prime Minister +Felipe Gonzalez, visited Amman and reiterated their countries' +support for the kingdom in various fields. +U.S. Secretary of State Warren Christopher came to Jordan time +and again to meet with King Hussein during his more than a dozen +of shuttle visits to the region to expedite the ongoing Middle +East peace process. +The Clinton Administration also voiced its full support, without +reservation, to the kingdom to defend its security immediately +after Iraqi former Industry Minister Kamel Hassan defected to +Jordan in August, warning against a threat of Iraqi retaliation +upon Jordan. +In the past year, King Hussein, Crown Prince Hassan and Prime +Minister Sharif Zeid Ben Shaker as well as other senior Jordanian +officials traveled to London, Paris, Tokyo, Bonn and Washington, +bringing home agreements for economic aid or military assistance +as well as promises of support. +Its success of hosting the second Middle East and North Africa +economic summit in Amman, which drew the participation of 1,600 +government officials and private businessmen from 62 countries +across the world, added great credit to the kingdom and was considered +the crowning accomplishment of Jordan's foreign policy. + + + + + XIE19960101.0002 + 1996-01-01 20:58 + + Yearender: Jordanian Diplomacy Witnesses Great Achievements in 1995 (by Wen Xinnian) (part one) + +AMMAN, January 1 (Xinhua) -- Jordan, in the past year, +witnessed a series of remarkable achievements on its diplomatic +front. +In the year, the kingdom restored its territorial and water +rights from Israel, normalized relations with the Jewish state, +improved ties with other Arab states and moved closer to the +Western countries, especially the United States. +The historic Israel-Jordan peace treaty signed in October, +1994, which ended 46 years of enmity between them, heralded a new +era for Jordan's foreign policy and was consequently lauded as a +model of peace in the region. +In 1995, Jordan regained sovereignty over its territories +occupied by Israel and exchanged ambassadors with Tel Aviv. It +also signed a number of agreements with Israel, expanding peace to +economic cooperation. +Its grant of political asylum to General Hussein Kamel Hassan, +son-in-law of President Saddam Hussein, and its growing +estrangement from Iraq, have quickened the pace to improve its +relations with the Gulf Arab states, which were strained because +of Jordan's tilt toward Iraq during the 1990-1991 Gulf crisis. +In the past year, the kingdom also saw an outstanding warm-up +of ties with Saudi Arabia, the strongest of the six-nation Gulf +Cooperation Council (GCC). +Jordan's Foreign Minister Abdul-Karim Kabariti visited Riyadh +twice during the year and was received by Saudi King Fahd on his +second visit to the oil-rich Gulf kingdom. +Saudi Ambassador to Jordan Abdullah Sudairi arrived in Amman +last month and took his post five years after Saudi Arabia +recalled its ambassador in Amman, marking the return to normal of +the bilateral relations. +A visit to Saudi Arabia by King Hussein and his summit meeting +with King Fahd, which had been scheduled for early this month but +were canceled due to Fahd's sudden illness, are expected to +materialize soon. +Kuwait, invaded by Iraq in 1990, has not formally announced its +normalization with Jordan, but has been moving closer to it. +The two countries' foreign ministers met repeatedly on the +sidelines of many regional and international gatherings and +agreed to conducting regular coordination and consultation. + + +