From 2115f41b1af00867bc060631d55bffedc3a6845b Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Tue, 25 May 2021 15:58:54 -0400 Subject: [PATCH 01/10] Integrating Docker to enable running Macaw in multiple OS. Added Docker script, updated some code and README file to make it working for stdio interface using Indri and DrQA. --- .gitignore | 4 + Dockerfile | 95 +++++++++++++++++++ README.md | 81 +++++++++++++++- macaw/cis.py | 2 +- macaw/core/input_handler/action_detection.py | 2 +- macaw/core/mrc/drqa_mrc.py | 4 - macaw/interface/interface.py | 2 +- macaw/interface/stdio.py | 7 +- macaw/live_main.py | 26 ++--- scripts/start.sh | 10 ++ trec_documents/.DS_Store | Bin 0 -> 6148 bytes trec_documents/docset.trectext | 94 ++++++++++++++++++ 12 files changed, 299 insertions(+), 28 deletions(-) create mode 100644 Dockerfile create mode 100644 scripts/start.sh create mode 100644 trec_documents/.DS_Store create mode 100644 trec_documents/docset.trectext diff --git a/.gitignore b/.gitignore index 894a44c..f3c7b2a 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ venv.bak/ # mypy .mypy_cache/ + +# PyCharm stuff: +.idea + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..87c0943 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,95 @@ +FROM ubuntu:16.04 +MAINTAINER ahattimare@umass.edu + +WORKDIR /usr/src/app + +# Disable interactive input. This is needed to install mongodb. +ENV DEBIAN_FRONTEND noninteractive + +RUN apt-get update && apt-get install -y \ + gnupg \ + wget \ + g++ \ + make \ + zlib1g-dev \ + software-properties-common \ + unzip \ + default-jre \ + git \ + apt-transport-https ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Download Indri search engine for document retrieval. +RUN wget --no-check-certificate https://sourceforge.net/projects/lemur/files/lemur/indri-5.11/indri-5.11.tar.gz \ + && tar -xvzf indri-5.11.tar.gz \ + && rm indri-5.11.tar.gz + +# Install Indri. +RUN cd indri-5.11 \ + && ./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0" \ + && make \ + && make install + +# Install pip but not to the latest version as it does not support pyndri installation due to Python 2.7 incompatibility. +# https://stackoverflow.com/questions/65896334 +RUN apt-get update && apt-get install -y python3-pip \ + && pip3 install --upgrade "pip < 21.0" + +RUN pip3 install pyndri + +RUN apt update && apt install -y ffmpeg + +# Install all dependencies mentioned in the macaw requirements document. +COPY requirements.txt requirements.txt +RUN pip3 install -r requirements.txt + +RUN pip3 install torch + +# Download Stanford core NLP data if user has not specified a local volume. This is a 400MB compressed file. +ARG download_stanford_corenlp=false +RUN if $download_stanford_corenlp ; then \ + wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip" \ + && unzip "stanford-corenlp-full-2017-06-09.zip" \ + && rm "stanford-corenlp-full-2017-06-09.zip" ; fi + +# Download and install FAIR DrQA module. +RUN git clone https://github.com/facebookresearch/DrQA.git +RUN cd DrQA \ + && pip3 install -r requirements.txt \ + && python3 setup.py develop + +# Download a pre-trained DrQA model if user has not specified a local volume. This is a 7.5GB compressed file download +# and requires 25GB of uncompressed space. To save Docker image memory, it is recommended to use external file as volume. +ARG download_drqa_model=false +RUN if $download_drqa_model ; then cd DrQA && ./download.sh ; fi + +# Install MongoDB server https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ +RUN wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | apt-key add - +RUN echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.4 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-4.4.list +RUN apt-get update && apt-get install -y mongodb-org + +# Create the MongoDB data directory. +RUN mkdir -p /data/db + +# Copy directory that contains trectext documents needed for Indri retriever. +COPY trec_documents trec_documents + +# Build indri index in directory /usr/src/app/indri-5.11/buildindex/my_index. Not needed if running with Bing. +ARG use_indri_retriever=true +RUN if $use_indri_retriever ; then cd indri-5.11/buildindex \ + && ./IndriBuildIndex -memory=100M -corpus.class=trectext -index=my_index \ + -stemmer.name=Krovetz -corpus.path=/usr/src/app/trec_documents \ + ; fi + +# Copy files and directories from workspace to Docker container. +COPY macaw macaw +COPY scripts scripts +COPY setup.py setup.py + +ENV PYTHONPATH="$PYTHONPATH:/usr/src/app" + +# Install Macaw. +RUN python3 setup.py install + +# Run the script that will start MongoDB and run python application. +CMD ["/bin/bash", "scripts/start.sh"] diff --git a/README.md b/README.md index c6e6d06..b4c7a5e 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,79 @@ For question answering, Macaw only features [the DrQA model](https://github.com/ version. -## Installation -Macaw requires `Python >= 3.6` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. +## Installation and running with Docker +The package has been tested with certain dependencies and it is much easier to reproduce it in a similar environment. It has +been integrated with Docker to make it compatible with all operating systems. The default Docker setup runs the application +using the Standard IO interface, uses Indri for document retrieval, and DrQA for MRC (answer selection). To run using other +settings, appropriate changes should be done. + +The first step is to install [Docker](https://docs.docker.com/engine/install/) in your system. Then continue with the below +steps. + +#### Create the build +To reduce the size of the build, we can keep certain data outside the Docker container and mount it using [volumes](https://docs.docker.com/storage/volumes/). +1. Download the Stanford Core NLP data from [here](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and +put the directory `stanford-corenlp-full-2017-06-09` in your project root directory. +1. Install [DrQA](https://github.com/facebookresearch/DrQA) in a separate workspace and download the pre-trained model. It +stores the models in `data/reader/` directory. We will use the downloaded *multitask.mdl* model. + +Once you have the two downloads done, run the below command from project root to create a docker build with name *macaw*: +```commandline +docker build -t macaw . +``` + +If you don't want to pre-install DrQA model and Stanford Core NLP data, create the build using the below command. It will +install both dependencies for you and keep them as part of the build. Note that this will significantly increase the build +size (by ~400MB for Stanford CoreNLP and by ~7.5GB for DrQA). +```commandline +docker build --build-arg download_stanford_corenlp=true --build-arg download_drqa_model=true -t macaw . +``` + +#### Run the application +If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need to +provide the local directory path during runtime. Run the command from project root. +```commandline +docker run --rm -i --name=macaw_test_container \ +-v :/usr/src/app/DrQA/data \ +-v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \ +macaw +``` +`` could be `/Users/amitgh/PycharmProjects/DrQA/data` if you downloaded the pre-trained model +in a separate workspace named DrQA. + +If you did not separately download data at build time, simply run: +```commandline +docker run --rm -i --name=macaw_test_container macaw +``` + +In above command we start a container with name *macaw_test_container* from build image *macaw* in interactive mode (`-i`) +and remove the container when the application exits (`--rm`). After installing all dependencies, it runs `scripts/start.sh` +which first starts MongoDB server in a separate thread and then runs `live_main.py`. + +#### ssh into the container +While the application is running, we can go inside the container to see the contents (directory structure, indri index, etc.). +```commandline +docker exec -it macaw_test_container /bin/bash +``` + +#### Updating TREC data for Indri +Indri index is created using the document stored in `trec_documents/` directory. It has some default data. To create a bigger +index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents. Docker will +copy it during build time and create a new index. Index creation parameters like *-memory*, *-stemmer.name* can be changed +inside the Dockerfile. + +### A note on dependencies +Pyndri is tested only with [Indri-5.11](https://sourceforge.net/projects/lemur/files/lemur/indri-5.11/) which in turn only +supports certain OS. We use Ubuntu-16.04 in our Docker container. Ubuntu 18.04 creates issues during indri index creation +because of some unsupported C++11 functions. *pip* version <21.0 is needed to properly install pyndri. + + +## Local Setup + +To setup the package locally without using Docker, follow the below instructions. + +### Installation +Macaw requires `Python >= 3.5` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. To install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The mentioned installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux distribution. If you are using Windows 10, we recommend installing Macaw and all the required packages on @@ -128,8 +199,6 @@ To use pre-trained DrQA model, use the following command. ./download.sh ``` This downloads a 7.5GB (compressed) file and requires 25GB (uncompressed) space. This may take a while! - - #### Step 5: Installing FFmpeg To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't @@ -146,7 +215,7 @@ sudo pip3 install -r requirements.txt sudo python3 setup.py install ``` -## Running Macaw +### Running Macaw If you run macaw with interactive (or live) mode, you should first run MongoDB server using the following command: ``` sudo mongod @@ -176,6 +245,7 @@ python3 live_main.py For bug report and feature request, you can open an issue in github, or send an email to [Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`. + ## Citation If you found Macaw useful, you can cite the following article: ``` @@ -192,6 +262,7 @@ bibtex: } ``` + ## License Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information. diff --git a/macaw/cis.py b/macaw/cis.py index a36ab54..8954c31 100644 --- a/macaw/cis.py +++ b/macaw/cis.py @@ -35,7 +35,7 @@ def __init__(self, params): self.nlp_util = util.NLPUtil(self.params) self.params['nlp_util'] = self.nlp_util except Exception as ex: - self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module.') + self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module. ' + str(ex)) self.timeout = self.params['timeout'] if 'timeout' in self.params else -1 def live_request_handler(self, msg): diff --git a/macaw/core/input_handler/action_detection.py b/macaw/core/input_handler/action_detection.py index a49439f..b7e8743 100644 --- a/macaw/core/input_handler/action_detection.py +++ b/macaw/core/input_handler/action_detection.py @@ -107,7 +107,7 @@ def dispatch(self, conv_list): p.join() candidate_outputs = dict() - for key in action_results: + for key in action_results.keys(): if action_results[key]: candidate_outputs[key] = action_results[key] return candidate_outputs diff --git a/macaw/core/mrc/drqa_mrc.py b/macaw/core/mrc/drqa_mrc.py index 8c72659..b3e5c4a 100644 --- a/macaw/core/mrc/drqa_mrc.py +++ b/macaw/core/mrc/drqa_mrc.py @@ -78,7 +78,3 @@ def get_results(self, conv_list, doc): for i, p in enumerate(predictions, 1): results.append(Document(None, None, p[0], p[1])) return results - - - - diff --git a/macaw/interface/interface.py b/macaw/interface/interface.py index 63312ab..e82ec8a 100644 --- a/macaw/interface/interface.py +++ b/macaw/interface/interface.py @@ -17,4 +17,4 @@ def run(self): @abstractmethod def result_presentation(self, response_msg, params): - pass \ No newline at end of file + pass diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py index aca7aa0..ffab707 100644 --- a/macaw/interface/stdio.py +++ b/macaw/interface/stdio.py @@ -20,7 +20,9 @@ def __init__(self, params): def run(self): while True: try: - request = input('ENTER YOUR COMMAND: ').strip() + request = input('ENTER YOUR COMMAND (type \'exit\' to leave): ').strip() + if request == 'exit': + break if len(request) == 0: continue user_info = {'first_name': 'STDIO', @@ -55,7 +57,6 @@ def result_presentation(self, response_msg, params): else: raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type']) print('----------------------------------------------------------------------') - print('THE RESPONSE STARTS') + print('THE RESPONSE ENDS') except Exception as ex: traceback.print_exc() - diff --git a/macaw/live_main.py b/macaw/live_main.py index bf1ec30..8de33f9 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -68,20 +68,21 @@ def run(self): 'interaction_db_name': 'macaw_test'} # These are interface parameters. They are interface specific. - interface_params = {'interface': 'telegram', # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' - # for exp mode. - 'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN', # Telegram bot token. - 'asr_model': 'google', # The API used for speech recognition. - 'asg_model': 'google', # The API used for speech generation. - 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE'} + interface_params = {'interface': 'stdio', # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' + # for experimental mode. + 'bot_token': 'YOUR_TELEGRAM_BOT_TOKEN', # Telegram bot token. + # 'asr_model': 'google', # The API used for speech recognition. + # 'asg_model': 'google', # The API used for speech generation. + 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE' + } # These are parameters used by the retrieval model. retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history. 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False. - 'search_engine': 'bing', # the search engine. It can be either 'indri' or 'bing'. + 'search_engine': 'indri', # the search engine. It can be either 'indri' or 'bing'. 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key - 'search_engine_path': 'PATH_TO_INDRI', # The path to the indri toolkit. - 'col_index': 'PATH_TO_INDRI_INDEX', # The path to the indri index. + 'search_engine_path': '/usr/src/app/indri-5.11', # The path to the indri toolkit. + 'col_index': '/usr/src/app/indri-5.11/buildindex/my_index', # The path to the indri index. 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported. 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine. # Note: If you want to have a re-ranking model (e.g., learning to rank), you just need to simply extend the class @@ -92,12 +93,11 @@ def run(self): # These are parameters used by the machine reading comprehension model. mrc_params = {'mrc': 'drqa', # MRC model. - 'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL', # The path to the model parameters. - 'mrc_path': 'PATH_TO_MRC_DIRECTORY', # The path to the model toolkit. - 'corenlp_path': 'PATH_TO_STANFORD_CORE_NLP_DIRECTORY', # The path to the corenlp toolkit. + 'mrc_model_path': '/usr/src/app/DrQA/data/reader/multitask.mdl', # The path to the model parameters. + 'mrc_path': '/usr/src/app/DrQA', # The path to the model toolkit. + 'corenlp_path': '/usr/src/app/stanford-corenlp-full-2017-06-09', # The path to the corenlp toolkit. 'qa_results_requested': 3} # The number of candidate answers returned by the MRC model. params = {**basic_params, **db_params, **interface_params, **retrieval_params, **mrc_params} basic_params['logger'].info(params) ConvQA(params).run() - diff --git a/scripts/start.sh b/scripts/start.sh new file mode 100644 index 0000000..9628fab --- /dev/null +++ b/scripts/start.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Inside the execution script." + +# Start MongoDB as a daemon process https://docs.mongodb.com/manual/tutorial/manage-mongodb-processes/ +mongod --fork --logpath /var/log/mongodb/mongod.log +echo "MongoDB started successfully." + +# Start the main application. +python3 macaw/live_main.py diff --git a/trec_documents/.DS_Store b/trec_documents/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 + XIE19960101.0001 + 1996-01-01 20:06 + + Yearender: Jordanian Diplomacy Witnesses Great Achievements in 1995 (part two) + +The normalization with Egypt, an active supporter of the +anti-Iraq coalition during the Gulf crisis, was symbolized by the +visit of Egyptian President Hosni Mubarak to Jordan last January. +At the same time, Jordan improved its ties with the +Palestinians by dispelling their fears that arouse among the +Palestinians due to Jordan's special role in Jerusalem under the +Jordanian-Israeli accord. +In addition, Jordan also made efforts to push forward the +Palestinian-Israeli negotiations on expanding Palestinian autonomy +in the West Bank. +To its own benefits, Jordan's peace treaty with Israel and new +attitude toward Baghdad have also brought about warmer ties with +the Western countries, with the United States in particular. +Among others, U.S. Vice President Al Gore, British Prime +Minister John Major, German Chancellor Helmut Kohl, Japanese Prime +Minister Tomiichi Murayama as well as Spanish Prime Minister +Felipe Gonzalez, visited Amman and reiterated their countries' +support for the kingdom in various fields. +U.S. Secretary of State Warren Christopher came to Jordan time +and again to meet with King Hussein during his more than a dozen +of shuttle visits to the region to expedite the ongoing Middle +East peace process. +The Clinton Administration also voiced its full support, without +reservation, to the kingdom to defend its security immediately +after Iraqi former Industry Minister Kamel Hassan defected to +Jordan in August, warning against a threat of Iraqi retaliation +upon Jordan. +In the past year, King Hussein, Crown Prince Hassan and Prime +Minister Sharif Zeid Ben Shaker as well as other senior Jordanian +officials traveled to London, Paris, Tokyo, Bonn and Washington, +bringing home agreements for economic aid or military assistance +as well as promises of support. +Its success of hosting the second Middle East and North Africa +economic summit in Amman, which drew the participation of 1,600 +government officials and private businessmen from 62 countries +across the world, added great credit to the kingdom and was considered +the crowning accomplishment of Jordan's foreign policy. + + + + + XIE19960101.0002 + 1996-01-01 20:58 + + Yearender: Jordanian Diplomacy Witnesses Great Achievements in 1995 (by Wen Xinnian) (part one) + +AMMAN, January 1 (Xinhua) -- Jordan, in the past year, +witnessed a series of remarkable achievements on its diplomatic +front. +In the year, the kingdom restored its territorial and water +rights from Israel, normalized relations with the Jewish state, +improved ties with other Arab states and moved closer to the +Western countries, especially the United States. +The historic Israel-Jordan peace treaty signed in October, +1994, which ended 46 years of enmity between them, heralded a new +era for Jordan's foreign policy and was consequently lauded as a +model of peace in the region. +In 1995, Jordan regained sovereignty over its territories +occupied by Israel and exchanged ambassadors with Tel Aviv. It +also signed a number of agreements with Israel, expanding peace to +economic cooperation. +Its grant of political asylum to General Hussein Kamel Hassan, +son-in-law of President Saddam Hussein, and its growing +estrangement from Iraq, have quickened the pace to improve its +relations with the Gulf Arab states, which were strained because +of Jordan's tilt toward Iraq during the 1990-1991 Gulf crisis. +In the past year, the kingdom also saw an outstanding warm-up +of ties with Saudi Arabia, the strongest of the six-nation Gulf +Cooperation Council (GCC). +Jordan's Foreign Minister Abdul-Karim Kabariti visited Riyadh +twice during the year and was received by Saudi King Fahd on his +second visit to the oil-rich Gulf kingdom. +Saudi Ambassador to Jordan Abdullah Sudairi arrived in Amman +last month and took his post five years after Saudi Arabia +recalled its ambassador in Amman, marking the return to normal of +the bilateral relations. +A visit to Saudi Arabia by King Hussein and his summit meeting +with King Fahd, which had been scheduled for early this month but +were canceled due to Fahd's sudden illness, are expected to +materialize soon. +Kuwait, invaded by Iraq in 1990, has not formally announced its +normalization with Jordan, but has been moving closer to it. +The two countries' foreign ministers met repeatedly on the +sidelines of many regional and international gatherings and +agreed to conducting regular coordination and consultation. + + + From 803f41cda1cfd4f2920ba123daff82fc382ba7a3 Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Fri, 20 May 2022 14:07:07 -0400 Subject: [PATCH 02/10] Fix dockerfile dependencies. - Using python 3.6 instead of 3.5 --- Dockerfile | 15 ++++++++++++++- scripts/start.sh | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 87c0943..71a2656 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,6 +30,19 @@ RUN cd indri-5.11 \ && make \ && make install +# Install python 3.6 because 3.5 version is not compatible with new certifi changes. Gives error at runtime. +# https://github.com/certifi/python-certifi/issues/195 +# Before installing Python 3.6, install the below so that packages of requirements.txt are installed correctly later. +RUN apt-get update && apt-get install -y \ + build-essential checkinstall \ + libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev +RUN wget "https://www.python.org/ftp/python/3.6.3/Python-3.6.3.tgz" \ + && tar -xvf Python-3.6.3.tgz \ + && cd Python-3.6.3 \ + && ./configure \ + && make \ + && make install + # Install pip but not to the latest version as it does not support pyndri installation due to Python 2.7 incompatibility. # https://stackoverflow.com/questions/65896334 RUN apt-get update && apt-get install -y python3-pip \ @@ -41,7 +54,7 @@ RUN apt update && apt install -y ffmpeg # Install all dependencies mentioned in the macaw requirements document. COPY requirements.txt requirements.txt -RUN pip3 install -r requirements.txt +RUN pip3 install -r requirements.txt --trusted-host pypi.python.org RUN pip3 install torch diff --git a/scripts/start.sh b/scripts/start.sh index 9628fab..f1034dc 100644 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -6,5 +6,7 @@ echo "Inside the execution script." mongod --fork --logpath /var/log/mongodb/mongod.log echo "MongoDB started successfully." +echo "This is the python version: $(python3 --version)" + # Start the main application. python3 macaw/live_main.py From 14e7560431aa92eb817facc7496eb642fa852124 Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Fri, 20 May 2022 14:22:36 -0400 Subject: [PATCH 03/10] minor changes. --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 71a2656..5cef241 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,11 +52,11 @@ RUN pip3 install pyndri RUN apt update && apt install -y ffmpeg +RUN pip3 install torch + # Install all dependencies mentioned in the macaw requirements document. COPY requirements.txt requirements.txt -RUN pip3 install -r requirements.txt --trusted-host pypi.python.org - -RUN pip3 install torch +RUN pip3 install -r requirements.txt # Download Stanford core NLP data if user has not specified a local volume. This is a 400MB compressed file. ARG download_stanford_corenlp=false From 0313b059c1884597e05e71db27bf5d6d26fc2ba6 Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Wed, 25 May 2022 13:28:23 -0400 Subject: [PATCH 04/10] Remove .idea folder. (#3) Co-authored-by: Amit Hattimare --- .idea/.gitignore | 3 --- .idea/inspectionProfiles/profiles_settings.xml | 6 ------ .idea/macaw.iml | 17 ----------------- .idea/misc.xml | 7 ------- .idea/modules.xml | 8 -------- .idea/vcs.xml | 6 ------ 6 files changed, 47 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/macaw.iml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 0e40fe8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ - -# Default ignored files -/workspace.xml \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/macaw.iml b/.idea/macaw.iml deleted file mode 100644 index 055624e..0000000 --- a/.idea/macaw.iml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index d1151ac..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 7021aec..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 21b11348ff43f6942b570302477ec9c907b6f27b Mon Sep 17 00:00:00 2001 From: Arkin Dharawat Date: Wed, 1 Jun 2022 01:07:34 +0530 Subject: [PATCH 05/10] Add Tantivy (#4) * Tantivy changes without deleting .idea files. * gitignore changes. * Upadted files to use tantivy with index path * added missing args * replace raw body with trec doc obj * added index to body * added a single docs list. * Removed indri from Dockerfile * Remove commented code. Removed indri.py and updated README.md * Added PR changes. Removed commented code. * Added PR changes. Co-authored-by: Amit Hattimare --- .gitignore | 5 +- Dockerfile | 42 ++---- README.md | 218 ++++++++++++++++--------------- macaw/build_tantivy_index.py | 55 ++++++++ macaw/core/retrieval/__init__.py | 23 ++-- macaw/core/retrieval/indri.py | 79 ----------- macaw/core/retrieval/tantivy.py | 47 +++++++ macaw/live_main.py | 4 +- requirements.txt | 1 + 9 files changed, 244 insertions(+), 230 deletions(-) create mode 100644 macaw/build_tantivy_index.py delete mode 100644 macaw/core/retrieval/indri.py create mode 100644 macaw/core/retrieval/tantivy.py diff --git a/.gitignore b/.gitignore index f3c7b2a..3601c50 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,5 @@ venv.bak/ # mypy .mypy_cache/ -# PyCharm stuff: -.idea - +# PyCharm stuff +.idea/ diff --git a/Dockerfile b/Dockerfile index 5cef241..e95bf76 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,36 +19,18 @@ RUN apt-get update && apt-get install -y \ apt-transport-https ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Download Indri search engine for document retrieval. -RUN wget --no-check-certificate https://sourceforge.net/projects/lemur/files/lemur/indri-5.11/indri-5.11.tar.gz \ - && tar -xvzf indri-5.11.tar.gz \ - && rm indri-5.11.tar.gz - -# Install Indri. -RUN cd indri-5.11 \ - && ./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0" \ - && make \ - && make install - -# Install python 3.6 because 3.5 version is not compatible with new certifi changes. Gives error at runtime. -# https://github.com/certifi/python-certifi/issues/195 -# Before installing Python 3.6, install the below so that packages of requirements.txt are installed correctly later. +# Before installing Python 3.7, install the below so that packages of requirements.txt are installed correctly later. RUN apt-get update && apt-get install -y \ build-essential checkinstall \ libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev -RUN wget "https://www.python.org/ftp/python/3.6.3/Python-3.6.3.tgz" \ - && tar -xvf Python-3.6.3.tgz \ - && cd Python-3.6.3 \ +RUN wget "https://www.python.org/ftp/python/3.7.9/Python-3.7.9.tgz" \ + && tar -xvf Python-3.7.9.tgz \ + && cd Python-3.7.9 \ && ./configure \ && make \ && make install -# Install pip but not to the latest version as it does not support pyndri installation due to Python 2.7 incompatibility. -# https://stackoverflow.com/questions/65896334 -RUN apt-get update && apt-get install -y python3-pip \ - && pip3 install --upgrade "pip < 21.0" - -RUN pip3 install pyndri +RUN apt-get update && apt-get install -y python3-pip RUN apt update && apt install -y ffmpeg @@ -87,13 +69,6 @@ RUN mkdir -p /data/db # Copy directory that contains trectext documents needed for Indri retriever. COPY trec_documents trec_documents -# Build indri index in directory /usr/src/app/indri-5.11/buildindex/my_index. Not needed if running with Bing. -ARG use_indri_retriever=true -RUN if $use_indri_retriever ; then cd indri-5.11/buildindex \ - && ./IndriBuildIndex -memory=100M -corpus.class=trectext -index=my_index \ - -stemmer.name=Krovetz -corpus.path=/usr/src/app/trec_documents \ - ; fi - # Copy files and directories from workspace to Docker container. COPY macaw macaw COPY scripts scripts @@ -104,5 +79,12 @@ ENV PYTHONPATH="$PYTHONPATH:/usr/src/app" # Install Macaw. RUN python3 setup.py install +# To fix async keyword issue in python3.7 https://github.com/pexpect/pexpect/issues/453 +RUN pip3 install -Iv pexpect==4.8.0 + +# Create index +RUN mkdir tantivy_index/ +RUN python3 macaw/build_tantivy_index.py --index_path tantivy_index/ --document_path trec_documents/ + # Run the script that will start MongoDB and run python application. CMD ["/bin/bash", "scripts/start.sh"] diff --git a/README.md b/README.md index b4c7a5e..b030766 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,25 @@ # Macaw: An Extensible Conversational Information Seeking Platform + Conversational information seeking (CIS) has been recognized as a major emerging research area in information retrieval. -Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is -an open-source framework with a modular architecture for CIS research. Macaw supports *multi-turn*, *multi-modal*, and -*mixed-initiative* interactions, for tasks such as document retrieval, question answering, recommendation, and -structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be -evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in -an interactive mode, where the back end can be *fully algorithmic* or a *wizard of oz* setup. - -Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language +Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is an +open-source framework with a modular architecture for CIS research. Macaw supports *multi-turn*, *multi-modal*, and +*mixed-initiative* interactions, for tasks such as document retrieval, question answering, recommendation, and +structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be +evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in +an interactive mode, where the back end can be *fully algorithmic* or a *wizard of oz* setup. + +Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language processing, and dialogue systems. For more information on Macaw, please refer to [this paper](https://arxiv.org/pdf/1912.08904.pdf). Table of content: + + [Macaw Architecture](#macaw-architecture) + [Interfaces](#interfaces) + [Retrieval](#retrieval) + [Answer Selection and Generation](#answer-selection-and-generation) -+ [Installation](#installation) ++ [Installation](#installation) + [Running Macaw](#running-macaw) + [Bug Report and Feature Request](#bug-report-and-feature-request) + [Citation](#citation) @@ -25,6 +27,7 @@ Table of content: + [Contribution](#contribution) ## Macaw Architecture + Macaw has a modular architecture, which allows further development and extension. The high-level architecture of Macaw is presented below: @@ -33,142 +36,137 @@ is presented below: For more information on each module in Macaw, refer to this paper. #### Interfaces + Macaw supports the following interfaces: + + Standard IO: For *development* purposes + File IO: For *batch experiments* (see the examples in the `data` folder for input and output file formats) + Telegram bot: For interaction with real users Here is an example of the Telegram interface for Macaw. It supports multi-modal interactions (text, speech, click, etc). -![Telegram interface for Macaw](macaw-example-tax.jpg) +![Telegram interface for Macaw](macaw-example-tax.jpg) ![Telegram interface for Macaw](macaw-example-shakespeare.jpg) - #### Retrieval + Macaw features the following search engines: -+ [Indri](http://lemurproject.org/indri.php): an open-source search engine that can be used for any arbitrary text -collection. + ++ [Tantivy](https://github.com/quickwit-oss/tantivy): Tantivy is a full-text search engine library written in Rust. + Bing web search API: sending a request to the Bing API and getting the results. #### Answer Selection and Generation -For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current -version. +For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current +version. ## Installation and running with Docker -The package has been tested with certain dependencies and it is much easier to reproduce it in a similar environment. It has -been integrated with Docker to make it compatible with all operating systems. The default Docker setup runs the application -using the Standard IO interface, uses Indri for document retrieval, and DrQA for MRC (answer selection). To run using other -settings, appropriate changes should be done. -The first step is to install [Docker](https://docs.docker.com/engine/install/) in your system. Then continue with the below -steps. +The package has been tested with certain dependencies and it is much easier to reproduce it in a similar environment. It +has been integrated with Docker to make it compatible with all operating systems. The default Docker setup runs the +application using the Standard IO interface, uses Tantivy for document retrieval, and DrQA for MRC (answer selection). To +run using other settings, appropriate changes should be done. + +The first step is to install [Docker](https://docs.docker.com/engine/install/) in your system. Then continue with the +below steps. #### Create the build -To reduce the size of the build, we can keep certain data outside the Docker container and mount it using [volumes](https://docs.docker.com/storage/volumes/). -1. Download the Stanford Core NLP data from [here](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and -put the directory `stanford-corenlp-full-2017-06-09` in your project root directory. -1. Install [DrQA](https://github.com/facebookresearch/DrQA) in a separate workspace and download the pre-trained model. It -stores the models in `data/reader/` directory. We will use the downloaded *multitask.mdl* model. -Once you have the two downloads done, run the below command from project root to create a docker build with name *macaw*: +To reduce the size of the build, we can keep certain data outside the Docker container and mount it +using [volumes](https://docs.docker.com/storage/volumes/). + +1. Download the Stanford Core NLP data + from [here](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and put the + directory `stanford-corenlp-full-2017-06-09` in your project root directory. +1. Install [DrQA](https://github.com/facebookresearch/DrQA) in a separate workspace and download the pre-trained model. + It stores the models in `data/reader/` directory. We will use the downloaded *multitask.mdl* model. + +Once you have the two downloads done, run the below command from project root to create a docker build with name * +macaw*: + ```commandline docker build -t macaw . ``` -If you don't want to pre-install DrQA model and Stanford Core NLP data, create the build using the below command. It will -install both dependencies for you and keep them as part of the build. Note that this will significantly increase the build -size (by ~400MB for Stanford CoreNLP and by ~7.5GB for DrQA). +If you don't want to pre-install DrQA model and Stanford Core NLP data, create the build using the below command. It +will install both dependencies for you and keep them as part of the build. Note that this will significantly increase +the build size (by ~400MB for Stanford CoreNLP and by ~7.5GB for DrQA). + ```commandline docker build --build-arg download_stanford_corenlp=true --build-arg download_drqa_model=true -t macaw . ``` #### Run the application -If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need to -provide the local directory path during runtime. Run the command from project root. + +If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need +to provide the local directory path during runtime. Run the command from project root. + ```commandline docker run --rm -i --name=macaw_test_container \ -v :/usr/src/app/DrQA/data \ -v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \ macaw ``` -`` could be `/Users/amitgh/PycharmProjects/DrQA/data` if you downloaded the pre-trained model -in a separate workspace named DrQA. + +`` could be `/Users/amitgh/PycharmProjects/DrQA/data` if you downloaded the pre-trained model in a +separate workspace named DrQA. If you did not separately download data at build time, simply run: + ```commandline docker run --rm -i --name=macaw_test_container macaw ``` -In above command we start a container with name *macaw_test_container* from build image *macaw* in interactive mode (`-i`) -and remove the container when the application exits (`--rm`). After installing all dependencies, it runs `scripts/start.sh` +In above command we start a container with name *macaw_test_container* from build image *macaw* in interactive +mode (`-i`) +and remove the container when the application exits (`--rm`). After installing all dependencies, it +runs `scripts/start.sh` which first starts MongoDB server in a separate thread and then runs `live_main.py`. #### ssh into the container -While the application is running, we can go inside the container to see the contents (directory structure, indri index, etc.). + +While the application is running, we can go inside the container to see the contents (directory structure, Tantivy index, +etc.). + ```commandline docker exec -it macaw_test_container /bin/bash ``` -#### Updating TREC data for Indri -Indri index is created using the document stored in `trec_documents/` directory. It has some default data. To create a bigger -index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents. Docker will -copy it during build time and create a new index. Index creation parameters like *-memory*, *-stemmer.name* can be changed -inside the Dockerfile. - -### A note on dependencies -Pyndri is tested only with [Indri-5.11](https://sourceforge.net/projects/lemur/files/lemur/indri-5.11/) which in turn only -supports certain OS. We use Ubuntu-16.04 in our Docker container. Ubuntu 18.04 creates issues during indri index creation -because of some unsupported C++11 functions. *pip* version <21.0 is needed to properly install pyndri. +#### Updating TREC data for Tantivy +Tantivy index is created using the document stored in `trec_documents/` directory. It has some default data. To create a +bigger index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents. +Docker will copy it during build time and create a new index. ## Local Setup To setup the package locally without using Docker, follow the below instructions. ### Installation -Macaw requires `Python >= 3.5` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. -To install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The -mentioned installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux -distribution. If you are using Windows 10, we recommend installing Macaw and all the required packages on + +Macaw requires `Python >= 3.5` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. To +install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The mentioned +installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux distribution. +If you are using Windows 10, we recommend installing Macaw and all the required packages on [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10). #### Step 1: Installing MongoDB server + Macaw uses MongoDB for storing and retrieving user interactions (conversations). To install MongoDB server, run the following command: -``` -sudo apt-get install mongodb-server-core -``` -#### Step 2: Installing Indri and Pyndri -[Indri](http://lemurproject.org/indri.php) is an open-source search engine for information retrieval research, -implemented as part of the [Lemur Project](http://lemurproject.org/). -[Pyndri](https://github.com/cvangysel/pyndri) is a python interface to Indri. Macaw uses Indri for retrieving documents -from an arbitrary text collection. -To install Indri, first download Indri from https://sourceforge.net/projects/lemur/files/lemur/. As suggested by pyndri, -we have used Indri-5.11. This Indri version can be installed as follows: -``` -# download indri-5.11.tar.gz -sudo apt install g++ zlib1g-dev -tar xzvf indri-5.11.tar.gz -rm indri-5.11.tar.gz -cd indri-5.11 -./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0" -make -sudo make install ``` - -Then, clone the pyndri repository from https://github.com/cvangysel/pyndri and run the following command: -``` -python3 setup.py install +sudo apt-get install mongodb-server-core ``` At this step, you can make sure your installation is complete by running the pyndri tests. -#### Step 3: Installing Stanford Core NLP -Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need -co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. Get it by following these +#### Step 2: Installing Stanford Core NLP + +Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need +co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. Get it by following these commands: + ``` wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip" sudo apt-get install unzip @@ -177,15 +175,17 @@ rm "stanford-corenlp-full-2017-06-09.zip" ``` If you don't have `java`, install it using: + ``` sudo apt-get install default-jre ``` -#### Step 4: Installing DrQA -Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it +#### Step 3: Installing DrQA + +Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it features [DrQA](https://github.com/facebookresearch/DrQA). If you do not need this functionality, ignore this step (you -can also install this later). -To install DrQA, run the following commands: +can also install this later). To install DrQA, run the following commands: + ``` git clone https://github.com/facebookresearch/DrQA.git cd DrQA @@ -194,21 +194,27 @@ pip3 install torch sudo python3 setup.py develop ``` -To use pre-trained DrQA model, use the following command. +To use pre-trained DrQA model, use the following command. + ``` ./download.sh ``` + This downloads a 7.5GB (compressed) file and requires 25GB (uncompressed) space. This may take a while! -#### Step 5: Installing FFmpeg -To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't -need a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command: +#### Step 4: Installing FFmpeg + +To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't need +a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command: + ``` sudo apt-get install ``` -#### Step 6: Installing Macaw +#### Step 5: Installing Macaw + After cloning Macaw, use the following commands for installation: + ``` cd macaw sudo pip3 install -r requirements.txt @@ -216,23 +222,26 @@ sudo python3 setup.py install ``` ### Running Macaw + If you run macaw with interactive (or live) mode, you should first run MongoDB server using the following command: + ``` sudo mongod ``` -Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create -this directory if you haven't. You can also use other locations using the `--dbpath` argument. +Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create +this directory if you haven't. You can also use other locations using the `--dbpath` argument. We provide three different main scripts (i.e., app): + + `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram -interfaces. -+ `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the -interface. + interfaces. ++ `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the + interface. + `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments. - + After selecting the desired main script, open the python file and provide the required parameters. For example, you need -to use your Bing subscription key (if using Bing), the path to Indri index (if using Indri), Telegram bot token (if +to use your Bing subscription key (if using Bing), the path to Tantivy index, Telegram bot token (if using Telegram interface), etc. in order to run the `live_main.py` script. You can further run the favorite main script as below: @@ -240,19 +249,21 @@ as below: python3 live_main.py ``` - ## Bug Report and Feature Request -For bug report and feature request, you can open an issue in github, or send an email to -[Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`. +For bug report and feature request, you can open an issue in github, or send an email to +[Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`. ## Citation + If you found Macaw useful, you can cite the following article: + ``` Hamed Zamani and Nick Craswell, "Macaw: An Extensible Conversational Information Seeking System", arxiv pre-print. ``` bibtex: + ``` @article{macaw, title={Macaw: An Extensible Conversational Information Seeking Platform}, @@ -262,20 +273,19 @@ bibtex: } ``` - ## License -Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information. +Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information. ## Contribution -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License +Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For +details, visit https://cla.opensource.microsoft.com. -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate +the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only +need to do this once across all repos using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or diff --git a/macaw/build_tantivy_index.py b/macaw/build_tantivy_index.py new file mode 100644 index 0000000..e584c7f --- /dev/null +++ b/macaw/build_tantivy_index.py @@ -0,0 +1,55 @@ +import argparse +import os +from typing import List + +import tantivy + +from macaw.core.retrieval.doc import get_trec_doc + + +def get_trec_docs(documents_path: str) -> List[str]: + # can be optimized + doc_list = [] + raw_body_list = [] + for trec_files in os.listdir(documents_path): + if not trec_files.startswith('.'): + with open(os.path.join(documents_path, trec_files), 'r', encoding="utf-8") as fobj: + trec_txt = fobj.read() + raw_body_list.append(trec_txt) + doc_list.append(get_trec_doc(trec_txt)) + return doc_list, raw_body_list + + +def main(index_path, documents_path): + # build the schema + schema_builder = tantivy.SchemaBuilder() + schema_builder.add_text_field("body", stored=True) + schema_builder.add_unsigned_field("doc_id", stored=True) + schema = schema_builder.build() + # create index + index = tantivy.Index(schema, path=index_path) + # read all trec doc + documents, raw_docs = get_trec_docs(documents_path) + # add documents + print('Building sparse index of {} docs...'.format(len(documents))) + writer = index.writer() + for i, doc in enumerate(raw_docs): + writer.add_document(tantivy.Document( + body=[doc], # this is the raw text of the trec document + doc_id=i + )) + if (i + 1) % 100000 == 0: + writer.commit() + print('Indexed {} docs'.format(i + 1)) + writer.commit() + print('Built sparse index') + index.reload() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Index documents') + parser.add_argument('--index_path', type=str, help='path to store the index') + parser.add_argument('--document_path', type=str, help='path for documents to index') + args = parser.parse_args() + + main(args.index_path, args.document_path) diff --git a/macaw/core/retrieval/__init__.py b/macaw/core/retrieval/__init__.py index 994182e..597b960 100644 --- a/macaw/core/retrieval/__init__.py +++ b/macaw/core/retrieval/__init__.py @@ -4,7 +4,7 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ import macaw.core.retrieval.bing_api -import macaw.core.retrieval.indri +import macaw.core.retrieval.tantivy from macaw.core.retrieval import search_engine, query_generation @@ -28,17 +28,16 @@ def get_retrieval_model(params): raise Exception('The requested query generation model does not exist!') params['logger'].info('The search engine for retrieval: ' + params['search_engine']) - if params['search_engine'] == 'indri': - return macaw.core.retrieval.indri.Indri({'query_generation': q_generation, - 'indri_path': params['search_engine_path'], - 'index': params['col_index'], - 'text_format': params['col_text_format'], - 'results_requested': params['results_requested'], - 'logger': params['logger']}) + if params['search_engine'] == 'tantivy': + return macaw.core.retrieval.tantivy.Tantivy({'query_generation': q_generation, + 'path': params['search_engine_path'], + 'load': True, + 'results_requested': params['results_requested'], + 'logger': params['logger']}) elif params['search_engine'] == 'bing': return macaw.core.retrieval.bing_api.BingWebSearch({'query_generation': q_generation, - 'bing_key': params['bing_key'], - 'results_requested': params['results_requested'], - 'logger': params['logger']}) + 'bing_key': params['bing_key'], + 'results_requested': params['results_requested'], + 'logger': params['logger']}) else: - raise Exception('The requested retrieval model does not exist!') \ No newline at end of file + raise Exception(f'{params["search_engine"]} retrieval model does not exist!') diff --git a/macaw/core/retrieval/indri.py b/macaw/core/retrieval/indri.py deleted file mode 100644 index 77bb073..0000000 --- a/macaw/core/retrieval/indri.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -The Indri search engine. - -Authors: Hamed Zamani (hazamani@microsoft.com) -""" - -import os -import subprocess - -import pyndri - -from macaw.core.retrieval.doc import get_trec_doc -from macaw.core.retrieval.search_engine import Retrieval - - -class Indri(Retrieval): - def __init__(self, params): - """ - The Indri retrieval model. Indri is an open-source search engine implemented as part of the lemur project by - UMass Amherst and CMU. Refer to http://lemurproject.org/indri.php for more information. - The retrieval model used here is based on language modeling framework and retrieves documents using the query - likelihood retrieval model [Ponte & Croft; SIGIR 1998] and Dirichlet prior smoothing [Zhai and Lafferty; SIGIR - 2001]. It is implemented using the Pyndri [Van Gysel et al.; ECIR 2017], which is a python interface to Indri. - Refer to http://lemurproject.org/indri.php for more information on the Lemur toolkit. - - Args: - params(dict): A dict containing some parameters. Here is the list of all required parameters: - 'indri_path': The path to the installed Indri toolkit. - 'index': The path to the Indri index constructed from the collection. - 'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1. - 'text_format': The text format for document collection (e.g., 'trectext'). - Note that the parameters 'query_generation' and 'logger' are required by the parent class. - """ - super().__init__(params) - self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1 - self.indri_path = self.params['indri_path'] - self.index = pyndri.Index(self.params['index']) - self.term2id, self.id2term, self.id2df = self.index.get_dictionary() - self.id2tf = self.index.get_term_frequencies() - - def retrieve(self, query): - """ - This method retrieve documents in response to the given query. - - Args: - query(str): The query string. - - Returns: - A list of Documents with the maximum length of the 'results_requested' parameter. - """ - int_results = self.index.query(query, results_requested=self.results_requested) - results = [] - for int_doc_id, score in int_results: - # ext_doc_id, content_term_id = self.index.document(int_doc_id) - # index_content = [self.id2term[term_id] if term_id> 0 else 'UNK' for term_id in content_term_id] - doc = self.get_doc_from_index(int_doc_id)[0] - doc.score = score - doc.id = str(int_doc_id) - results.append(doc) - return results - - def get_doc_from_index(self, doc_id): - """ - This method retrieves a document content for a given document id. - - Args: - doc_id(str): The document ID. - - Returns: - A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns - a list of Documents with a length of 1. - """ - content = subprocess.run([os.path.join(self.indri_path, 'dumpindex/dumpindex'), self.params['index'], - 'dt', str(doc_id)], stdout=subprocess.PIPE).stdout.decode('UTF-8') - if self.params['text_format'] == 'trectext': - doc = get_trec_doc(content) - else: - raise Exception('The requested text format is not supported!') - return [doc] \ No newline at end of file diff --git a/macaw/core/retrieval/tantivy.py b/macaw/core/retrieval/tantivy.py new file mode 100644 index 0000000..230285a --- /dev/null +++ b/macaw/core/retrieval/tantivy.py @@ -0,0 +1,47 @@ +""" +The Tantivy search engine. + +Authors: Arkin Dharawat (adharawat@umass.edu) +""" + +import os +import subprocess + +import tantivy + +from macaw.core.retrieval.doc import get_trec_doc +from macaw.core.retrieval.search_engine import Retrieval + + +class Tantivy(Retrieval): + def __init__(self, params): + super().__init__(params) + self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1 + + if not os.path.exists(params['path']): + os.mkdir(params['path']) + schema_builder = tantivy.SchemaBuilder() + schema_builder.add_text_field("body", stored=True) + schema_builder.add_unsigned_field("doc_id", stored=True) + schema = schema_builder.build() + self.index = tantivy.Index(schema, path=params['path'], reuse=params['load']) + self.searcher = self.index.searcher() + self.logger = params['logger'] + self.logger.info('Loaded index and searcher') + + def retrieve(self, query): + docs = [] + try: + query = self.index.parse_query(query, ["body"]) + scores = self.searcher.search(query, self.results_requested).hits + # docs = [(self.searcher.doc(doc_id)['doc_id'], score) for score, doc_id in scores] + docs = [get_trec_doc(self.searcher.doc(doc_id)['body'][0]) + for _, doc_id in scores] # convert the raw text into trec-doc + except Exception as e: + self.logger.error(f'Error on Query {e}') + return None + + return docs + + def get_doc_from_index(self, doc_id): + return [get_trec_doc(self.searcher.doc(doc_id)['body'])] diff --git a/macaw/live_main.py b/macaw/live_main.py index 8de33f9..29d1602 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -79,9 +79,9 @@ def run(self): # These are parameters used by the retrieval model. retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history. 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False. - 'search_engine': 'indri', # the search engine. It can be either 'indri' or 'bing'. + 'search_engine': 'tantivy', # the search engine. 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key - 'search_engine_path': '/usr/src/app/indri-5.11', # The path to the indri toolkit. + 'search_engine_path': 'tantivy_index/', # The path to the tantivy index. 'col_index': '/usr/src/app/indri-5.11/buildindex/my_index', # The path to the indri index. 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported. 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine. diff --git a/requirements.txt b/requirements.txt index 4a068f5..5c4c74a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ pydub python-telegram-bot==12.0.0 stanfordcorenlp google-cloud-texttospeech +tantivy From 4858543cb69ab27766a7081fef5d85222cdde4dc Mon Sep 17 00:00:00 2001 From: George Wei <44126639+gzhihongwei@users.noreply.github.com> Date: Wed, 1 Jun 2022 00:43:28 -0400 Subject: [PATCH 06/10] Upgrade docker image (#5) * Upgraded Ubuntu and mongodb * Added dependencies to gitignore * Added arm64 instructions/warning * Resolve async keyword issue for python3.7+ Co-authored-by: Amit Hattimare --- .gitignore | 4 ++++ Dockerfile | 22 +++++---------------- README.md | 58 +++++++++++++++++++++++++++++------------------------- 3 files changed, 40 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index 3601c50..70f77d2 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,7 @@ venv.bak/ # PyCharm stuff .idea/ + +# Dependencies +DrQA/ +stanford-corenlp-full-* diff --git a/Dockerfile b/Dockerfile index e95bf76..b227825 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,4 @@ -FROM ubuntu:16.04 -MAINTAINER ahattimare@umass.edu +FROM ubuntu:20.04 WORKDIR /usr/src/app @@ -19,22 +18,11 @@ RUN apt-get update && apt-get install -y \ apt-transport-https ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Before installing Python 3.7, install the below so that packages of requirements.txt are installed correctly later. -RUN apt-get update && apt-get install -y \ - build-essential checkinstall \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev -RUN wget "https://www.python.org/ftp/python/3.7.9/Python-3.7.9.tgz" \ - && tar -xvf Python-3.7.9.tgz \ - && cd Python-3.7.9 \ - && ./configure \ - && make \ - && make install - RUN apt-get update && apt-get install -y python3-pip RUN apt update && apt install -y ffmpeg -RUN pip3 install torch +RUN pip3 install --upgrade pip && pip3 install torch # Install all dependencies mentioned in the macaw requirements document. COPY requirements.txt requirements.txt @@ -59,8 +47,8 @@ ARG download_drqa_model=false RUN if $download_drqa_model ; then cd DrQA && ./download.sh ; fi # Install MongoDB server https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ -RUN wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | apt-key add - -RUN echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.4 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-4.4.list +RUN wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | apt-key add - +RUN echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/5.0 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-5.0.list RUN apt-get update && apt-get install -y mongodb-org # Create the MongoDB data directory. @@ -79,7 +67,7 @@ ENV PYTHONPATH="$PYTHONPATH:/usr/src/app" # Install Macaw. RUN python3 setup.py install -# To fix async keyword issue in python3.7 https://github.com/pexpect/pexpect/issues/453 +# To fix async keyword issue in python3.7+ https://github.com/pexpect/pexpect/issues/453 RUN pip3 install -Iv pexpect==4.8.0 # Create index diff --git a/README.md b/README.md index b030766..616bff9 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ Conversational information seeking (CIS) has been recognized as a major emerging research area in information retrieval. Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is an -open-source framework with a modular architecture for CIS research. Macaw supports *multi-turn*, *multi-modal*, and -*mixed-initiative* interactions, for tasks such as document retrieval, question answering, recommendation, and +open-source framework with a modular architecture for CIS research. Macaw supports _multi-turn_, _multi-modal_, and +_mixed-initiative_ interactions, for tasks such as document retrieval, question answering, recommendation, and structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in -an interactive mode, where the back end can be *fully algorithmic* or a *wizard of oz* setup. +an interactive mode, where the back end can be _fully algorithmic_ or a _wizard of oz_ setup. Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language processing, and dialogue systems. @@ -15,16 +15,16 @@ For more information on Macaw, please refer to [this paper](https://arxiv.org/pd Table of content: -+ [Macaw Architecture](#macaw-architecture) - + [Interfaces](#interfaces) - + [Retrieval](#retrieval) - + [Answer Selection and Generation](#answer-selection-and-generation) -+ [Installation](#installation) -+ [Running Macaw](#running-macaw) -+ [Bug Report and Feature Request](#bug-report-and-feature-request) -+ [Citation](#citation) -+ [License](#license) -+ [Contribution](#contribution) +- [Macaw Architecture](#macaw-architecture) + - [Interfaces](#interfaces) + - [Retrieval](#retrieval) + - [Answer Selection and Generation](#answer-selection-and-generation) +- [Installation](#installation) +- [Running Macaw](#running-macaw) +- [Bug Report and Feature Request](#bug-report-and-feature-request) +- [Citation](#citation) +- [License](#license) +- [Contribution](#contribution) ## Macaw Architecture @@ -39,9 +39,9 @@ For more information on each module in Macaw, refer to this paper. Macaw supports the following interfaces: -+ Standard IO: For *development* purposes -+ File IO: For *batch experiments* (see the examples in the `data` folder for input and output file formats) -+ Telegram bot: For interaction with real users +- Standard IO: For _development_ purposes +- File IO: For _batch experiments_ (see the examples in the `data` folder for input and output file formats) +- Telegram bot: For interaction with real users Here is an example of the Telegram interface for Macaw. It supports multi-modal interactions (text, speech, click, etc). @@ -52,8 +52,8 @@ Here is an example of the Telegram interface for Macaw. It supports multi-modal Macaw features the following search engines: -+ [Tantivy](https://github.com/quickwit-oss/tantivy): Tantivy is a full-text search engine library written in Rust. -+ Bing web search API: sending a request to the Bing API and getting the results. +- [Tantivy](https://github.com/quickwit-oss/tantivy): Tantivy is a full-text search engine library written in Rust. +- Bing web search API: sending a request to the Bing API and getting the results. #### Answer Selection and Generation @@ -79,10 +79,9 @@ using [volumes](https://docs.docker.com/storage/volumes/). from [here](http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip) and put the directory `stanford-corenlp-full-2017-06-09` in your project root directory. 1. Install [DrQA](https://github.com/facebookresearch/DrQA) in a separate workspace and download the pre-trained model. - It stores the models in `data/reader/` directory. We will use the downloaded *multitask.mdl* model. + It stores the models in `data/reader/` directory. We will use the downloaded _multitask.mdl_ model. -Once you have the two downloads done, run the below command from project root to create a docker build with name * -macaw*: +Once you have the two downloads done, run the below command from project root to create a docker build with name _macaw_: ```commandline docker build -t macaw . @@ -96,6 +95,8 @@ the build size (by ~400MB for Stanford CoreNLP and by ~7.5GB for DrQA). docker build --build-arg download_stanford_corenlp=true --build-arg download_drqa_model=true -t macaw . ``` +_Note: To make sure that the Docker container builds without modification, an x86_64/amd64 system is required. If you have an arm64 device, then add the flag `--platform linux/amd64` to the build command._ + #### Run the application If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need @@ -117,12 +118,15 @@ If you did not separately download data at build time, simply run: docker run --rm -i --name=macaw_test_container macaw ``` -In above command we start a container with name *macaw_test_container* from build image *macaw* in interactive +In above command we start a container with name _macaw_test_container_ from build image _macaw_ in interactive mode (`-i`) and remove the container when the application exits (`--rm`). After installing all dependencies, it runs `scripts/start.sh` which first starts MongoDB server in a separate thread and then runs `live_main.py`. +_Note: Similarly to requiring to requiring the additional flag of `--platform linux/amd64` to build the Docker container with an arm64 machine, running said container also requires the same flag. +:warning: **The performance of the container under this emulation will be incredibly poor. If possible, use a x86_64/amd64 system**._ + #### ssh into the container While the application is running, we can go inside the container to see the contents (directory structure, Tantivy index, @@ -172,7 +176,7 @@ wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software sudo apt-get install unzip unzip "stanford-corenlp-full-2017-06-09.zip" rm "stanford-corenlp-full-2017-06-09.zip" -``` +``` If you don't have `java`, install it using: @@ -208,7 +212,7 @@ To support speech interactions with users, Macaw requires FFmpeg for some multim a speech support from Macaw, you can skip this step. To install FFmpeg, run the following command: ``` -sudo apt-get install +sudo apt-get install ``` #### Step 5: Installing Macaw @@ -234,11 +238,11 @@ this directory if you haven't. You can also use other locations using the `--dbp We provide three different main scripts (i.e., app): -+ `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram +- `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram interfaces. -+ `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the +- `batch_ext_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the interface. -+ `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments. +- `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments. After selecting the desired main script, open the python file and provide the required parameters. For example, you need to use your Bing subscription key (if using Bing), the path to Tantivy index, Telegram bot token (if From 13074624a510fe82f464cfb7f8699f895c196963 Mon Sep 17 00:00:00 2001 From: George Wei <44126639+gzhihongwei@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:34:53 -0400 Subject: [PATCH 07/10] Adds State Management capabilities to Macaw (#6) * Added vscode to gitignore * Added preliminary implementation of state manager * Deleted redundant code * Changed timestamp to use datetime.utcnow * Added CurrentAttributes to init * Added CurrentAttributes as a shared parameter * Refactored Message.from_dict * Reformatted and sorted imports * Made user_interface, user_id, text, and timestamp mandatory and added more properties * Removed injector dependency * Moved attributes to separate file * Added nlp_pipeline and user_attributes to Message and different serialization * Schema for the Messages persisted to MongoDB * cr fix. Co-authored-by: Amit Hattimare --- .gitignore | 3 +- macaw/cis.py | 59 +++-- macaw/core/input_handler/actions.py | 27 ++- macaw/core/interaction_handler/__init__.py | 9 +- macaw/core/interaction_handler/attributes.py | 23 ++ macaw/core/interaction_handler/msg.py | 60 ++++-- .../interaction_handler/user_requests_db.py | 43 ++-- .../output_handler/naive_output_selection.py | 92 ++++---- macaw/core/retrieval/__init__.py | 42 ++-- macaw/core/retrieval/query_generation.py | 41 ++-- macaw/core/retrieval/tantivy.py | 29 +-- macaw/interface/__init__.py | 18 +- macaw/interface/fileio.py | 84 +++++--- macaw/interface/speech_recognition.py | 39 ++-- macaw/interface/stdio.py | 68 +++--- macaw/interface/telegram.py | 202 +++++++++++------- macaw/live_main.py | 85 +++++--- macaw/util/__init__.py | 27 ++- macaw/util/text_parser.py | 6 +- requirements.txt | 1 + schema.ts | 21 ++ 21 files changed, 617 insertions(+), 362 deletions(-) create mode 100644 macaw/core/interaction_handler/attributes.py create mode 100644 schema.ts diff --git a/.gitignore b/.gitignore index 70f77d2..24e83e0 100644 --- a/.gitignore +++ b/.gitignore @@ -103,8 +103,9 @@ venv.bak/ # mypy .mypy_cache/ -# PyCharm stuff +# IDE folders .idea/ +.vscode/ # Dependencies DrQA/ diff --git a/macaw/cis.py b/macaw/cis.py index 8954c31..6952ba8 100644 --- a/macaw/cis.py +++ b/macaw/cis.py @@ -5,11 +5,14 @@ """ from abc import ABC, abstractmethod +from datetime import datetime + from func_timeout import FunctionTimedOut from macaw import interface, util -from macaw.core.interaction_handler.user_requests_db import InteractionDB +from macaw.core.interaction_handler import CurrentAttributes from macaw.core.interaction_handler.msg import Message +from macaw.core.interaction_handler.user_requests_db import InteractionDB class CIS(ABC): @@ -22,26 +25,34 @@ def __init__(self, params): params(dict): A dict containing some parameters. """ self.params = params - if params['mode'] == 'live': - self.params['live_request_handler'] = self.live_request_handler - self.msg_db = InteractionDB(host=self.params['interaction_db_host'], - port=self.params['interaction_db_port'], - dbname=self.params['interaction_db_name']) - elif params['mode'] == 'exp': - self.params['experimental_request_handler'] = self.request_handler_func + if params["mode"] == "live": + self.params["live_request_handler"] = self.live_request_handler + self.msg_db = InteractionDB( + host=self.params["interaction_db_host"], + port=self.params["interaction_db_port"], + dbname=self.params["interaction_db_name"], + ) + self.params["curr_attrs"] = self.curr_attrs = CurrentAttributes() + elif params["mode"] == "exp": + self.params["experimental_request_handler"] = self.request_handler_func self.interface = interface.get_interface(params) try: self.nlp_util = util.NLPUtil(self.params) - self.params['nlp_util'] = self.nlp_util + self.params["nlp_util"] = self.nlp_util except Exception as ex: - self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module. ' + str(ex)) - self.timeout = self.params['timeout'] if 'timeout' in self.params else -1 + self.params["logger"].warning( + "WARNING: There is a problem with setting up the NLP utility module. " + + str(ex) + ) + self.timeout = self.params["timeout"] if "timeout" in self.params else -1 def live_request_handler(self, msg): try: # load conversation from the database and add the current message to the database - conv = [msg] + self.msg_db.get_conv_history(user_id=msg.user_id, max_time=10 * 60 * 1000, max_count=10) + conv = [msg] + self.msg_db.get_conv_history( + user_id=msg.user_id, max_time=10*60*1000, max_count=10 + ) self.msg_db.insert_one(msg) # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv]) @@ -50,13 +61,21 @@ def live_request_handler(self, msg): return output_msg except FunctionTimedOut: + print(f"live_request_handler method timed out.") msg_info = dict() - msg_info['msg_id'] = msg.msg_info['msg_id'] - msg_info['msg_source'] = 'system' - msg_info['msg_type'] = 'error' - text = 'Time out, no result!' - timestamp = util.current_time_in_milliseconds() - error_msg = Message(msg.user_interface, msg.user_id, msg.user_info, msg_info, text, timestamp) + msg_info["msg_id"] = msg.msg_info["msg_id"] + msg_info["msg_source"] = "system" + msg_info["msg_type"] = "error" + text = "Time out, no result!" + timestamp = datetime.utcnow() + error_msg = Message( + user_interface=msg.user_interface, + user_id=msg.user_id, + user_info=msg.user_info, + msg_info=msg_info, + text=text, + timestamp=timestamp, + ) self.msg_db.insert_one(error_msg) return error_msg @@ -77,7 +96,7 @@ def live_request_handler(self, msg): # user_info=user_info, # msg_info=msg_info, # text=str_list[i], - # timestamp=util.current_time_in_milliseconds()) + # timestamp=datetime.utcnow()) # conv_list.append(msg) # conv_list.reverse() # @@ -93,4 +112,4 @@ def request_handler_func(self, conv_list): @abstractmethod def run(self): - pass \ No newline at end of file + pass diff --git a/macaw/core/input_handler/actions.py b/macaw/core/input_handler/actions.py index 1b8fb38..d877699 100644 --- a/macaw/core/input_handler/actions.py +++ b/macaw/core/input_handler/actions.py @@ -4,9 +4,10 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from abc import ABC, abstractmethod -from func_timeout import func_timeout, FunctionTimedOut import traceback +from abc import ABC, abstractmethod + +from func_timeout import FunctionTimedOut, func_timeout class Action(ABC): @@ -38,7 +39,7 @@ def run(conv_list, params): Returns: A list of Documents. """ - return params['actions']['retrieval'].get_results(conv_list) + return params["actions"]["retrieval"].get_results(conv_list) class GetDocFromIndex(Action): @@ -54,7 +55,7 @@ def run(conv_list, params): Returns: A list of Documents with a length of 1. """ - return params['actions']['retrieval'].get_doc_from_index(params['doc_id']) + return params["actions"]["retrieval"].get_doc_from_index(params["doc_id"]) class QAAction(Action): @@ -73,12 +74,12 @@ def run(conv_list, params): """ doc_list = RetrievalAction.run(conv_list, params) - doc = '' + doc = "" for i in range(len(doc_list)): doc = doc_list[i].text if len(doc.strip()) > 0: break - return params['actions']['qa'].get_results(conv_list, doc) + return params["actions"]["qa"].get_results(conv_list, doc) def run_action(action, conv_list, params, return_dict): @@ -93,17 +94,21 @@ def run_action(action, conv_list, params, return_dict): return_dict(dict): A shared dict for all processes running this action. The actions' outputs should be added to this dict. """ - if action == 'retrieval': + if action == "retrieval": action_func = RetrievalAction.run - elif action == 'qa': + elif action == "qa": action_func = QAAction.run else: - raise Exception('Unknown Action!') + raise Exception("Unknown Action!") try: - return_dict[action] = func_timeout(params['timeout'], action_func, args=[conv_list, params]) + return_dict[action] = func_timeout( + params["timeout"], action_func, args=[conv_list, params] + ) except FunctionTimedOut: - params['logger'].warning('The action "%s" did not respond in %d seconds.', action, params['timeout']) + params["logger"].warning( + 'The action "%s" did not respond in %d seconds.', action, params["timeout"] + ) except Exception: return_dict[action] = None traceback.print_exc() diff --git a/macaw/core/interaction_handler/__init__.py b/macaw/core/interaction_handler/__init__.py index 70bb3a1..0bc32cc 100644 --- a/macaw/core/interaction_handler/__init__.py +++ b/macaw/core/interaction_handler/__init__.py @@ -1,5 +1,8 @@ """ -FILE DESCRIPTION +The interaction handler init. -Authors: Hamed Zamani (hazamani@microsoft.com) -""" \ No newline at end of file +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) +""" +from .attributes import CurrentAttributes, UserAttributes +from .msg import Message +from .user_requests_db import InteractionDB diff --git a/macaw/core/interaction_handler/attributes.py b/macaw/core/interaction_handler/attributes.py new file mode 100644 index 0000000..77c2040 --- /dev/null +++ b/macaw/core/interaction_handler/attributes.py @@ -0,0 +1,23 @@ +""" +The attribute singleton classes. + +Authors: George Wei (gzwei@umass.edu) +""" + + +class Singleton: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + def __new__(self, **kwargs): + if not hasattr(self, "instance"): + self.instance = super().__new__(self, **kwargs) + return self.instance + + +class CurrentAttributes(Singleton): + pass + + +class UserAttributes(Singleton): + pass diff --git a/macaw/core/interaction_handler/msg.py b/macaw/core/interaction_handler/msg.py index 1ae4d28..21ae98b 100644 --- a/macaw/core/interaction_handler/msg.py +++ b/macaw/core/interaction_handler/msg.py @@ -1,29 +1,57 @@ """ The message used to represent each interaction in Macaw. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ +from datetime import datetime +from typing import Optional, Union, Dict + +from .attributes import UserAttributes class Message: - def __init__(self, user_interface, user_id, user_info, msg_info, text, timestamp): + def __init__( + self, + user_interface: str, + user_id: Union[str, int], + text: str, + timestamp: datetime, + user_info: Optional[Dict[str, any]] = None, + msg_info: Optional[Dict[str, any]] = None, + actions: Optional[Dict[str, any]] = None, + dialog_state_tracking: Optional[Dict[str, any]] = None, + nlp_pipeline: Optional[Dict[str, any]] = None, + user_attributes: Optional[Dict[str, any]] = None, + ): """ An object for input and output Message. Args: user_interface(str): The interface name used for this message (e.g., 'telegram') - user_id(str or int): The user ID. - user_info(dict): The dict containing some more information about the user. - msg_info(dict): The dict containing some more information about the message. + user_id(str | int): The user ID. text(str): The message text. - timestamp(int): The timestamp of message in milliseconds. + timestamp(datetime.datetime): The timestamp of message. + user_info(dict): (Optional) The dict containing some more information about the user. + msg_info(dict): (Optional) The dict containing some more information about the message. + actions(dict): (Optional) The results from the various actions given the conversation history. + dialog_state_tracking(dict): (Optional) The dialog state tracking dict. + nlp_pipeline(dict): (Optional) The results from running the NLP pipeline. + user_attributes(dict): (Optional) The user attributes for this given message. """ + self.user_interface = user_interface self.user_id = user_id - self.user_info = user_info - self.msg_info = msg_info self.text = text self.timestamp = timestamp - self.user_interface = user_interface + self.user_info = user_info + self.msg_info = msg_info + self.actions = actions + self.dialog_state_tracking = dialog_state_tracking + self.nlp_pipeline = nlp_pipeline + + if user_attributes is None: + user_attributes = dict() + + self.user_attributes = UserAttributes(**user_attributes) @classmethod def from_dict(cls, msg_dict): @@ -35,10 +63,10 @@ def from_dict(cls, msg_dict): Returns: A Message object. """ - user_interface = msg_dict['user_interface'] if 'user_interface' in msg_dict else None - user_id = msg_dict['user_id'] if 'user_id' in msg_dict else None - user_info = msg_dict['user_info'] if 'user_info' in msg_dict else None - msg_info = msg_dict['msg_info'] if 'msg_info' in msg_dict else None - text = msg_dict['text'] if 'text' in msg_dict else None - timestamp = msg_dict['timestamp'] if 'timestamp' in msg_dict else None - return cls(user_interface, user_id, user_info, msg_info, text, timestamp) \ No newline at end of file + return cls(**msg_dict) + + def __iter__(self): + for attr, value in self.__dict__.items(): + yield attr, value if not isinstance( + value, UserAttributes + ) else value.__dict__ # If this doesn't work, default to `None` diff --git a/macaw/core/interaction_handler/user_requests_db.py b/macaw/core/interaction_handler/user_requests_db.py index bbb9342..11ae1a0 100644 --- a/macaw/core/interaction_handler/user_requests_db.py +++ b/macaw/core/interaction_handler/user_requests_db.py @@ -1,12 +1,13 @@ """ The conversation (or interaction) database implemented using MongoDB. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ +from datetime import datetime, timedelta + from pymongo import MongoClient -from macaw import util from macaw.core.interaction_handler.msg import Message @@ -14,23 +15,35 @@ class InteractionDB: def __init__(self, host, port, dbname): self.client = MongoClient(host, port) self.db = self.client[dbname] - self.col = self.db['macaw_msgs'] + self.col = self.db["macaw_msgs"] def insert_one(self, msg): - if msg.user_id is None or msg.text is None or msg.timestamp is None or msg.user_interface is None: - raise Exception('Each message should include a user_interface, user_id, text, and timestamp.') - self.col.insert_one(msg.__dict__) + self.col.insert_one(dict(msg)) + + def update_one(self, user_id, updates): + self.col.find_one_and_update( + {"user_id": user_id}, updates, sort=[("timestamp", -1)] + ) def get_all(self): - print('Using get_all is only recommended for development purposes. It is not efficient!') + print( + "Using get_all is only recommended for development purposes. It is not efficient!" + ) return self.dict_list_to_msg_list(self.col.find({})) def get_conv_history(self, user_id, max_time, max_count): if max_time is None: - res = self.col.find({'user_id': user_id}).sort([('timestamp', -1)]) + res = self.col.find({"user_id": user_id}, sort=[("timestamp", -1)]) else: - res = self.col.find({'user_id': user_id, - 'timestamp': {'$gt': util.current_time_in_milliseconds() - max_time}}).sort([('timestamp', -1)]) + res = self.col.find( + { + "user_id": user_id, + "timestamp": { + "$gt": datetime.utcnow() - timedelta(minutes=max_time) + }, + }, + sort=[("timestamp", -1)], + ) if max_count is not None: res = res.limit(max_count) @@ -41,8 +54,8 @@ def close(self): @staticmethod def dict_list_to_msg_list(msg_dict_list): - return [Message.from_dict(msg_dict) for msg_dict in msg_dict_list] - - - - + msg_list = [] + for msg_dict in msg_dict_list: + msg_dict.pop("_id") + msg_list.append(Message.from_dict(msg_dict=msg_dict)) + return msg_list diff --git a/macaw/core/output_handler/naive_output_selection.py b/macaw/core/output_handler/naive_output_selection.py index c7487bc..1215b83 100644 --- a/macaw/core/output_handler/naive_output_selection.py +++ b/macaw/core/output_handler/naive_output_selection.py @@ -1,12 +1,12 @@ """ The naive output post processing unit. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ +from datetime import datetime -from macaw import util -from macaw.core.output_handler.output_selection import OutputProcessing from macaw.core.interaction_handler.msg import Message +from macaw.core.output_handler.output_selection import OutputProcessing class NaiveOutputProcessing(OutputProcessing): @@ -35,20 +35,22 @@ def output_selection(self, conv_list, candidate_outputs): Returns: A str denoting the selected action. If none is selected, None is returned. """ - if '#get_doc' in candidate_outputs: - return '#get_doc' - if 'qa' in candidate_outputs: - if len(candidate_outputs['qa'][0].text) > 0: - if conv_list[0].text.endswith('?') \ - or conv_list[0].text.lower().startswith('what') \ - or conv_list[0].text.lower().startswith('who') \ - or conv_list[0].text.lower().startswith('when') \ - or conv_list[0].text.lower().startswith('where') \ - or conv_list[0].text.lower().startswith('how'): - return 'qa' - if 'retrieval' in candidate_outputs: - if len(candidate_outputs['retrieval']) > 0: - return 'retrieval' + if "#get_doc" in candidate_outputs: + return "#get_doc" + if "qa" in candidate_outputs: + if len(candidate_outputs["qa"][0].text) > 0: + if ( + conv_list[0].text.endswith("?") + or conv_list[0].text.lower().startswith("what") + or conv_list[0].text.lower().startswith("who") + or conv_list[0].text.lower().startswith("when") + or conv_list[0].text.lower().startswith("where") + or conv_list[0].text.lower().startswith("how") + ): + return "qa" + if "retrieval" in candidate_outputs: + if len(candidate_outputs["retrieval"]) > 0: + return "retrieval" return None def get_output(self, conv, candidate_outputs): @@ -68,32 +70,42 @@ def get_output(self, conv, candidate_outputs): user_id = conv[0].user_id user_info = conv[0].user_info msg_info = dict() - msg_info['msg_id'] = conv[0].msg_info['msg_id'] - msg_info['msg_source'] = 'system' - text = '' + msg_info["msg_id"] = conv[0].msg_info["msg_id"] + msg_info["msg_source"] = "system" + text = "" user_interface = conv[0].user_interface selected_action = self.output_selection(conv, candidate_outputs) if selected_action is None: - msg_info['msg_type'] = 'text' - msg_info['msg_creator'] = 'no answer error' - text = 'No response has been found! Please try again!' - elif selected_action == 'qa': - msg_info['msg_type'] = conv[0].msg_info['msg_type'] - msg_info['msg_creator'] = 'qa' - text = candidate_outputs['qa'][0].text - elif selected_action == 'retrieval': - msg_info['msg_type'] = 'options' - msg_info['msg_creator'] = 'retrieval' - text = 'Retrieved document list (click to see the document content):' - msg_info['options'] = [(output.title, '#get_doc ' + output.id, output.score) for output in candidate_outputs['retrieval']] - elif selected_action == '#get_doc': - msg_info['msg_type'] = 'text' - msg_info['msg_creator'] = '#get_doc' - text = candidate_outputs['#get_doc'][0].text + msg_info["msg_type"] = "text" + msg_info["msg_creator"] = "no answer error" + text = "No response has been found! Please try again!" + elif selected_action == "qa": + msg_info["msg_type"] = conv[0].msg_info["msg_type"] + msg_info["msg_creator"] = "qa" + text = candidate_outputs["qa"][0].text + elif selected_action == "retrieval": + msg_info["msg_type"] = "options" + msg_info["msg_creator"] = "retrieval" + text = "Retrieved document list (click to see the document content):" + msg_info["options"] = [ + (output.title, "#get_doc " + output.id, output.score) + for output in candidate_outputs["retrieval"] + ] + elif selected_action == "#get_doc": + msg_info["msg_type"] = "text" + msg_info["msg_creator"] = "#get_doc" + text = candidate_outputs["#get_doc"][0].text else: - raise Exception('The candidate output key is not familiar!') - timestamp = util.current_time_in_milliseconds() + raise Exception("The candidate output key is not familiar!") + timestamp = datetime.utcnow() if timestamp <= conv[0].timestamp: - raise Exception('There is a problem in the output timestamp!') - return Message(user_interface, user_id, user_info, msg_info, text, timestamp) \ No newline at end of file + raise Exception("There is a problem in the output timestamp!") + return Message( + user_interface=user_interface, + user_id=user_id, + user_info=user_info, + msg_info=msg_info, + text=text, + timestamp=timestamp + ) diff --git a/macaw/core/retrieval/__init__.py b/macaw/core/retrieval/__init__.py index 597b960..6ed9fdd 100644 --- a/macaw/core/retrieval/__init__.py +++ b/macaw/core/retrieval/__init__.py @@ -5,7 +5,7 @@ """ import macaw.core.retrieval.bing_api import macaw.core.retrieval.tantivy -from macaw.core.retrieval import search_engine, query_generation +from macaw.core.retrieval import query_generation def get_retrieval_model(params): @@ -21,23 +21,33 @@ def get_retrieval_model(params): Returns: A Retrieval object for document retrieval. """ - params['logger'].info('The query generation model for retrieval: ' + params['query_generation']) - if params['query_generation'] == 'simple': + params["logger"].info( + "The query generation model for retrieval: " + params["query_generation"] + ) + if params["query_generation"] == "simple": q_generation = query_generation.SimpleQueryGeneration(params) else: - raise Exception('The requested query generation model does not exist!') + raise Exception("The requested query generation model does not exist!") - params['logger'].info('The search engine for retrieval: ' + params['search_engine']) - if params['search_engine'] == 'tantivy': - return macaw.core.retrieval.tantivy.Tantivy({'query_generation': q_generation, - 'path': params['search_engine_path'], - 'load': True, - 'results_requested': params['results_requested'], - 'logger': params['logger']}) - elif params['search_engine'] == 'bing': - return macaw.core.retrieval.bing_api.BingWebSearch({'query_generation': q_generation, - 'bing_key': params['bing_key'], - 'results_requested': params['results_requested'], - 'logger': params['logger']}) + params["logger"].info("The search engine for retrieval: " + params["search_engine"]) + if params["search_engine"] == "tantivy": + return macaw.core.retrieval.tantivy.Tantivy( + { + "query_generation": q_generation, + "path": params["search_engine_path"], + "load": True, + "results_requested": params["results_requested"], + "logger": params["logger"], + } + ) + elif params["search_engine"] == "bing": + return macaw.core.retrieval.bing_api.BingWebSearch( + { + "query_generation": q_generation, + "bing_key": params["bing_key"], + "results_requested": params["results_requested"], + "logger": params["logger"], + } + ) else: raise Exception(f'{params["search_engine"]} retrieval model does not exist!') diff --git a/macaw/core/retrieval/query_generation.py b/macaw/core/retrieval/query_generation.py index 3047eb5..b1139b5 100644 --- a/macaw/core/retrieval/query_generation.py +++ b/macaw/core/retrieval/query_generation.py @@ -4,8 +4,8 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from abc import ABC, abstractmethod import string +from abc import ABC, abstractmethod class QueryGeneration(ABC): @@ -59,12 +59,14 @@ def get_query(self, conv_list): """ # q = ' '.join(msg.text for msg in conv_list) q = conv_list[0].text - if 'use_coref' in self.params and self.params['use_coref']: + if "use_coref" in self.params and self.params["use_coref"]: q_coref = self.get_query_coref(conv_list) for key in q_coref: - q += ' ' + ' '.join(q_coref[key]) + q += " " + " ".join(q_coref[key]) - q = q.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip() + q = q.translate( + str.maketrans(string.punctuation, " " * len(string.punctuation)) + ).strip() # print(q) return q @@ -84,20 +86,20 @@ def get_query_coref(self, conv_list): """ corenlp_coref_result = self.compute_corefs(conv_list) q_coref = dict() - last_index = len(corenlp_coref_result['sentences']) - for key in corenlp_coref_result['corefs']: + last_index = len(corenlp_coref_result["sentences"]) + for key in corenlp_coref_result["corefs"]: has_coref = False - for item in corenlp_coref_result['corefs'][key]: - if item['sentNum'] == last_index: + for item in corenlp_coref_result["corefs"][key]: + if item["sentNum"] == last_index: has_coref = True - text = item['text'] + text = item["text"] break if has_coref: q_coref[text] = [] - for item in corenlp_coref_result['corefs'][key]: - if item['sentNum'] == last_index: + for item in corenlp_coref_result["corefs"][key]: + if item["sentNum"] == last_index: continue - q_coref[text].append(item['text']) + q_coref[text].append(item["text"]) return q_coref def compute_corefs(self, conv_list): @@ -115,15 +117,18 @@ def compute_corefs(self, conv_list): """ conv_history = [] for msg in reversed(conv_list): - if msg.msg_info['msg_source'] == 'user' and msg.msg_info['msg_type'] in ['text', 'voice']: - temp = msg.text if msg.text.endswith('?') else (msg.text + '?') + if msg.msg_info["msg_source"] == "user" and msg.msg_info["msg_type"] in [ + "text", + "voice", + ]: + temp = msg.text if msg.text.endswith("?") else (msg.text + "?") conv_history.append(temp) # elif msg.msg_info['msg_source'] == 'system' and msg.msg_info['msg_type'] == 'text' and len(msg.text.split()) < 30: # temp = msg.text + '.' # conv_history.append(temp) if len(conv_history) == 0: - raise Exception('The query generation model cannot generate any query! There should be a problem') - coref_results = self.params['nlp_util'].get_coref(' '.join(conv_history)) + raise Exception( + "The query generation model cannot generate any query! There should be a problem" + ) + coref_results = self.params["nlp_util"].get_coref(" ".join(conv_history)) return coref_results - - diff --git a/macaw/core/retrieval/tantivy.py b/macaw/core/retrieval/tantivy.py index 230285a..1e23c39 100644 --- a/macaw/core/retrieval/tantivy.py +++ b/macaw/core/retrieval/tantivy.py @@ -5,7 +5,6 @@ """ import os -import subprocess import tantivy @@ -16,18 +15,22 @@ class Tantivy(Retrieval): def __init__(self, params): super().__init__(params) - self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1 - - if not os.path.exists(params['path']): - os.mkdir(params['path']) + self.results_requested = ( + self.params["results_requested"] + if "results_requested" in self.params + else 1 + ) + + if not os.path.exists(params["path"]): + os.mkdir(params["path"]) schema_builder = tantivy.SchemaBuilder() schema_builder.add_text_field("body", stored=True) schema_builder.add_unsigned_field("doc_id", stored=True) schema = schema_builder.build() - self.index = tantivy.Index(schema, path=params['path'], reuse=params['load']) + self.index = tantivy.Index(schema, path=params["path"], reuse=params["load"]) self.searcher = self.index.searcher() - self.logger = params['logger'] - self.logger.info('Loaded index and searcher') + self.logger = params["logger"] + self.logger.info("Loaded index and searcher") def retrieve(self, query): docs = [] @@ -35,13 +38,15 @@ def retrieve(self, query): query = self.index.parse_query(query, ["body"]) scores = self.searcher.search(query, self.results_requested).hits # docs = [(self.searcher.doc(doc_id)['doc_id'], score) for score, doc_id in scores] - docs = [get_trec_doc(self.searcher.doc(doc_id)['body'][0]) - for _, doc_id in scores] # convert the raw text into trec-doc + docs = [ + get_trec_doc(self.searcher.doc(doc_id)["body"][0]) + for _, doc_id in scores + ] # convert the raw text into trec-doc except Exception as e: - self.logger.error(f'Error on Query {e}') + self.logger.error(f"Error on Query {e}") return None return docs def get_doc_from_index(self, doc_id): - return [get_trec_doc(self.searcher.doc(doc_id)['body'])] + return [get_trec_doc(self.searcher.doc(doc_id)["body"])] diff --git a/macaw/interface/__init__.py b/macaw/interface/__init__.py index efebf6c..cc504ed 100644 --- a/macaw/interface/__init__.py +++ b/macaw/interface/__init__.py @@ -4,20 +4,20 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from macaw.interface import speech_recognition, telegram, stdio, fileio +from macaw.interface import fileio, speech_recognition, stdio, telegram def get_interface(params): - if 'asr_model' in params and params['asr_model'] == 'google': - params['asr'] = speech_recognition.GoogleASR(params) - if 'asg_model' in params and params['asg_model'] == 'google': - params['asg'] = speech_recognition.GoogleText2Speech(params) + if "asr_model" in params and params["asr_model"] == "google": + params["asr"] = speech_recognition.GoogleASR(params) + if "asg_model" in params and params["asg_model"] == "google": + params["asg"] = speech_recognition.GoogleText2Speech(params) - if params['interface'] == 'telegram': + if params["interface"] == "telegram": return telegram.TelegramBot(params) - elif params['interface'] == 'stdio': + elif params["interface"] == "stdio": return stdio.StdioInterface(params) - elif params['interface'] == 'fileio': + elif params["interface"] == "fileio": return fileio.FileioInterface(params) else: - raise Exception('The requested interface does not exist!') \ No newline at end of file + raise Exception("The requested interface does not exist!") diff --git a/macaw/interface/fileio.py b/macaw/interface/fileio.py index 467fb38..bed01f6 100644 --- a/macaw/interface/fileio.py +++ b/macaw/interface/fileio.py @@ -6,8 +6,8 @@ import time -from macaw.interface.interface import Interface from macaw.core.interaction_handler.msg import Message +from macaw.interface.interface import Interface class FileioInterface(Interface): @@ -16,46 +16,70 @@ def __init__(self, params): self.msg_id = int(time.time()) def run(self): - output_file = open(self.params['output_file_path'], 'w+') - with open(self.params['input_file_path']) as input_file: + output_file = open(self.params["output_file_path"], "w+") + with open(self.params["input_file_path"]) as input_file: for line in input_file: - str_list = line.strip().split('\t') + str_list = line.strip().split("\t") if len(str_list) < 2: - raise Exception('Each input line should contain at least 2 elements: a query ID and a query text.') + raise Exception( + "Each input line should contain at least 2 elements: a query ID and a query text." + ) qid = str_list[0] conv_list = [] for i in range(1, len(str_list)): - user_info = {'first_name': 'NONE'} - msg_info = {'msg_id': qid, - 'msg_type': 'text', - 'msg_source': 'user'} - msg = Message(user_interface='NONE', - user_id=-1, - user_info=user_info, - msg_info=msg_info, - text=str_list[i], - timestamp=-1) + user_info = {"first_name": "NONE"} + msg_info = {"msg_id": qid, "msg_type": "text", "msg_source": "user"} + msg = Message( + user_interface="NONE", + user_id=-1, + user_info=user_info, + msg_info=msg_info, + text=str_list[i], + timestamp=-1, + ) conv_list.append(msg) conv_list.reverse() - output_msg = self.params['experimental_request_handler'](conv_list) - self.result_presentation(output_msg, {'output_file': output_file, 'qid': qid}) + output_msg = self.params["experimental_request_handler"](conv_list) + self.result_presentation( + output_msg, {"output_file": output_file, "qid": qid} + ) output_file.close() def result_presentation(self, output_msg, params): - qid = params['qid'] - output_file = params['output_file'] - if self.params['output_format'] == 'trec': - if output_msg.msg_info['msg_type'] == 'options': - for (i, (option_name, option_id, output_score)) in enumerate(output_msg.msg_info['options']): - output_file.write(qid + '\tQ0\t' + option_name + '\t' + str(i+1) + '\t' + str(output_score) + '\tmacaw\n') + qid = params["qid"] + output_file = params["output_file"] + if self.params["output_format"] == "trec": + if output_msg.msg_info["msg_type"] == "options": + for (i, (option_name, option_id, output_score)) in enumerate( + output_msg.msg_info["options"] + ): + output_file.write( + qid + + "\tQ0\t" + + option_name + + "\t" + + str(i + 1) + + "\t" + + str(output_score) + + "\tmacaw\n" + ) else: - raise Exception('TREC output format is only recognized for retrieval results. ' - 'Therefore, the message type should be options.') - elif self.params['output_format'] == 'text': - if output_msg.msg_info['msg_type'] == 'text': - output_file.write(qid + '\t' + output_msg.text.replace('\n', ' ').replace('\t', ' ') + '\n') + raise Exception( + "TREC output format is only recognized for retrieval results. " + "Therefore, the message type should be options." + ) + elif self.params["output_format"] == "text": + if output_msg.msg_info["msg_type"] == "text": + output_file.write( + qid + + "\t" + + output_msg.text.replace("\n", " ").replace("\t", " ") + + "\n" + ) else: - raise Exception('text output format is only recognized for text outputs.') + raise Exception( + "text output format is only recognized for text outputs." + ) else: - raise Exception('Unknown output file format!') + raise Exception("Unknown output file format!") diff --git a/macaw/interface/speech_recognition.py b/macaw/interface/speech_recognition.py index 52a3ff1..467cfbc 100644 --- a/macaw/interface/speech_recognition.py +++ b/macaw/interface/speech_recognition.py @@ -4,30 +4,32 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -from abc import ABC, abstractmethod import os import tempfile +from abc import ABC, abstractmethod import speech_recognition as sr from google.cloud import texttospeech from pydub import AudioSegment -def mp3_to_ogg(input_file_name): # caller should delete the file afterwards. +def mp3_to_ogg(input_file_name): # caller should delete the file afterwards. ogg_file = tempfile.NamedTemporaryFile(delete=False) - AudioSegment.from_mp3(input_file_name).export(ogg_file.name, format='ogg', parameters=["-acodec", "libopus"]) + AudioSegment.from_mp3(input_file_name).export( + ogg_file.name, format="ogg", parameters=["-acodec", "libopus"] + ) ogg_file.close() return ogg_file.name -def ogg_to_wav(input_file_name): # caller should delete the file afterwards. +def ogg_to_wav(input_file_name): # caller should delete the file afterwards. wav_file = tempfile.NamedTemporaryFile(delete=False) - AudioSegment.from_ogg(input_file_name).export(wav_file.name, format='wav') + AudioSegment.from_ogg(input_file_name).export(wav_file.name, format="wav") wav_file.close() return wav_file.name -class ASR(ABC): # Automatic Speech Recognition +class ASR(ABC): # Automatic Speech Recognition def __init__(self, params): self.params = params @@ -36,7 +38,7 @@ def speech_to_text(self, file_path): pass -class ASG(ABC): # Automatic Speech Generation +class ASG(ABC): # Automatic Speech Generation def __init__(self, params): self.params = params @@ -62,23 +64,31 @@ def speech_to_text(self, file_path): except sr.UnknownValueError: print("Google Speech Recognition could not understand audio") except sr.RequestError as e: - print("Could not request results from Google Speech Recognition service; {0}".format(e)) + print( + "Could not request results from Google Speech Recognition service; {0}".format( + e + ) + ) class GoogleText2Speech(ASG): def __init__(self, params): super().__init__(params) - os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.params['google-speech-to-text-credential-file'] + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.params[ + "google-speech-to-text-credential-file" + ] # Instantiates a client self.client = texttospeech.TextToSpeechClient() # Build the voice request, select the language code ("en-US") and the ssml # voice gender ("neutral") self.voice = texttospeech.types.VoiceSelectionParams( - language_code='en-US', - ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL) + language_code="en-US", + ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL, + ) # Select the type of audio file you want returned self.audio_config = texttospeech.types.AudioConfig( - audio_encoding=texttospeech.enums.AudioEncoding.MP3) + audio_encoding=texttospeech.enums.AudioEncoding.MP3 + ) def text_to_speech(self, text): # Set the text input to be synthesized @@ -86,11 +96,12 @@ def text_to_speech(self, text): # Perform the text-to-speech request on the text input with the selected # voice parameters and audio file type - response = self.client.synthesize_speech(synthesis_input, self.voice, self.audio_config) + response = self.client.synthesize_speech( + synthesis_input, self.voice, self.audio_config + ) mp3_file = tempfile.NamedTemporaryFile(delete=True) mp3_file.write(response.audio_content) ogg_file_name = mp3_to_ogg(mp3_file.name) mp3_file.close() return ogg_file_name - diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py index ffab707..ff5844e 100644 --- a/macaw/interface/stdio.py +++ b/macaw/interface/stdio.py @@ -1,15 +1,15 @@ """ The STDIO interface for interactive CIS. -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ import time import traceback +from datetime import datetime -from macaw import util -from macaw.interface.interface import Interface from macaw.core.interaction_handler.msg import Message +from macaw.interface.interface import Interface class StdioInterface(Interface): @@ -20,43 +20,53 @@ def __init__(self, params): def run(self): while True: try: - request = input('ENTER YOUR COMMAND (type \'exit\' to leave): ').strip() - if request == 'exit': + request = input("ENTER YOUR COMMAND (type 'exit' to leave): ").strip() + if request == "exit": break if len(request) == 0: continue - user_info = {'first_name': 'STDIO', - 'is_bot': 'False' - } - msg_info = {'msg_id': self.msg_id, - 'msg_type': 'command' if request.startswith('#') else 'text', - 'msg_source': 'user'} + user_info = {"first_name": "STDIO", "is_bot": "False"} + msg_info = { + "msg_id": self.msg_id, + "msg_type": "command" if request.startswith("#") else "text", + "msg_source": "user", + } self.msg_id += 1 - msg = Message(user_interface='stdio', - user_id=-1, - user_info=user_info, - msg_info=msg_info, - text=request, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) + msg = Message( + user_interface="stdio", + user_id=-1, + user_info=user_info, + msg_info=msg_info, + text=request, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) self.result_presentation(output, {}) except Exception as ex: traceback.print_exc() def result_presentation(self, response_msg, params): try: - print('THE RESPONSE STARTS') - print('----------------------------------------------------------------------') - if response_msg.msg_info['msg_type'] == 'text': + print("THE RESPONSE STARTS") + print( + "----------------------------------------------------------------------" + ) + if response_msg.msg_info["msg_type"] == "text": print(response_msg.text) - elif response_msg.msg_info['msg_type'] == 'options': - for (option_text, option_data, output_score) in response_msg.msg_info['options']: - print(option_data, ' | ', option_text) - elif response_msg.msg_info['msg_type'] == 'error': - print('ERROR: NO RESULT!') + elif response_msg.msg_info["msg_type"] == "options": + for (option_text, option_data, output_score) in response_msg.msg_info[ + "options" + ]: + print(option_data, " | ", option_text) + elif response_msg.msg_info["msg_type"] == "error": + print("ERROR: NO RESULT!") else: - raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type']) - print('----------------------------------------------------------------------') - print('THE RESPONSE ENDS') + raise Exception( + "The msg_type is not recognized:", response_msg.msg_info["msg_type"] + ) + print( + "----------------------------------------------------------------------" + ) + print("THE RESPONSE ENDS") except Exception as ex: traceback.print_exc() diff --git a/macaw/interface/telegram.py b/macaw/interface/telegram.py index b6847a8..08b19a8 100644 --- a/macaw/interface/telegram.py +++ b/macaw/interface/telegram.py @@ -1,18 +1,24 @@ """ The Telegram bot (supports interactive multi-modal interactions with different devices). -Authors: Hamed Zamani (hazamani@microsoft.com) +Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ -import urllib.parse import os import tempfile import traceback +import urllib.parse +from datetime import datetime from telegram import InlineKeyboardButton, InlineKeyboardMarkup -from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler +from telegram.ext import ( + CallbackQueryHandler, + CommandHandler, + Filters, + MessageHandler, + Updater, +) -from macaw import util from macaw.core.interaction_handler.msg import Message from macaw.interface.interface import Interface @@ -26,21 +32,25 @@ def __init__(self, params): params(dict): A dict of parameters. The params 'logger' and 'bot_token' are mandatory. """ super().__init__(params) - self.logger = self.params['logger'] + self.logger = self.params["logger"] - self.MAX_MSG_LEN = 1000 # maximum number of characters in each response message. - self.MAX_OPTION_LEN = 30 # maximum number of characters in each clickable option text. + self.MAX_MSG_LEN = ( + 1000 # maximum number of characters in each response message. + ) + self.MAX_OPTION_LEN = ( + 30 # maximum number of characters in each clickable option text. + ) # Starting the bot by creating the Updater. # Make sure to set use_context=True to use the new context based callbacks # If you don't have a bot_token, add 'botfather' to your personal Telegram account and follow the instructions # to get a token for your bot. - self.updater = Updater(self.params['bot_token'], use_context=True) + self.updater = Updater(self.params["bot_token"], use_context=True) self.dp = self.updater.dispatcher # Telegram command handlers (e.g., /start) - self.dp.add_handler(CommandHandler('start', self.start)) - self.dp.add_handler(CommandHandler('help', self.help)) + self.dp.add_handler(CommandHandler("start", self.start)) + self.dp.add_handler(CommandHandler("help", self.help)) # Telegram message handlers self.dp.add_handler(MessageHandler(Filters.text, self.request_handler)) @@ -52,32 +62,41 @@ def __init__(self, params): def start(self, update, context): """Send a message when the command /start is issued.""" - update.message.reply_text('Hi, welcome to Macaw! Macaw is an open-source extensible framework for ' - 'conversational information seeking. Visit: https://github.com/microsoft/macaw') + update.message.reply_text( + "Hi, welcome to Macaw! Macaw is an open-source extensible framework for " + "conversational information seeking. Visit: https://github.com/microsoft/macaw" + ) def help(self, update, context): """Send a message when the command /help is issued.""" - update.message.reply_text('Macaw should be able to answer your questions. Just ask a question!') + update.message.reply_text( + "Macaw should be able to answer your questions. Just ask a question!" + ) def request_handler(self, update, context): """This method handles all text messages, and asks result_presentation to send the response to the user.""" try: self.logger.info(update.message) - user_info = {'first_name': update.message.chat.first_name, - 'last_name': update.message.chat.last_name, - 'is_bot': update._effective_user.is_bot + user_info = { + "first_name": update.message.chat.first_name, + "last_name": update.message.chat.last_name, + "is_bot": update._effective_user.is_bot, + } + msg_info = { + "msg_id": update.message.message_id, + "msg_type": "text", + "msg_source": "user", } - msg_info = {'msg_id': update.message.message_id, - 'msg_type': 'text', - 'msg_source': 'user'} - msg = Message(user_interface='telegram', - user_id=update.message.chat.id, - user_info=user_info, - msg_info=msg_info, - text=update.message.text, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {'update': update}) + msg = Message( + user_interface="telegram", + user_id=update.message.chat.id, + user_info=user_info, + msg_info=msg_info, + text=update.message.text, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"update": update}) except Exception: traceback.print_exc() @@ -86,25 +105,30 @@ def voice_request_handler(self, update, context): try: ogg_file = tempfile.NamedTemporaryFile(delete=True) update.message.voice.get_file().download(ogg_file.name) - text = self.params['asr'].speech_to_text(ogg_file.name) + text = self.params["asr"].speech_to_text(ogg_file.name) ogg_file.close() - update.message.reply_text('Macaw heard: ' + text) - - user_info = {'first_name': update.message.chat.first_name, - 'last_name': update.message.chat.last_name, - 'is_bot': update._effective_user.is_bot - } - msg_info = {'msg_id': update.message.message_id, - 'msg_type': 'voice', - 'msg_source': 'user'} - msg = Message(user_interface='telegram', - user_id=update.message.chat.id, - user_info=user_info, - msg_info=msg_info, - text=text, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {'update': update}) + update.message.reply_text("Macaw heard: " + text) + + user_info = { + "first_name": update.message.chat.first_name, + "last_name": update.message.chat.last_name, + "is_bot": update._effective_user.is_bot, + } + msg_info = { + "msg_id": update.message.message_id, + "msg_type": "voice", + "msg_source": "user", + } + msg = Message( + user_interface="telegram", + user_id=update.message.chat.id, + user_info=user_info, + msg_info=msg_info, + text=text, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"update": update}) except Exception: traceback.print_exc() @@ -112,21 +136,26 @@ def button_click_handler(self, update, context): """This method handles clicks, and asks result_presentation to send the response to the user.""" try: self.logger.info(update) - user_info = {'first_name': update.callback_query.message.chat.first_name, - 'last_name': update.callback_query.message.chat.last_name, - 'is_bot': update._effective_user.is_bot - } - msg_info = {'msg_id': update.callback_query.message.message_id, - 'msg_type': 'command', - 'msg_source': 'user'} - msg = Message(user_interface='telegram', - user_id=update.callback_query.message.chat.id, - user_info=user_info, - msg_info=msg_info, - text=update.callback_query.data, - timestamp=util.current_time_in_milliseconds()) - output = self.params['live_request_handler'](msg) - self.result_presentation(output, {'update': update}) + user_info = { + "first_name": update.callback_query.message.chat.first_name, + "last_name": update.callback_query.message.chat.last_name, + "is_bot": update._effective_user.is_bot, + } + msg_info = { + "msg_id": update.callback_query.message.message_id, + "msg_type": "command", + "msg_source": "user", + } + msg = Message( + user_interface="telegram", + user_id=update.callback_query.message.chat.id, + user_info=user_info, + msg_info=msg_info, + text=update.callback_query.data, + timestamp=datetime.utcnow(), + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"update": update}) except Exception as ex: traceback.print_exc() @@ -135,30 +164,50 @@ def result_presentation(self, response_msg, params): try: if response_msg is None: return - update = params['update'] - if response_msg.msg_info['msg_type'] == 'text': + update = params["update"] + if response_msg.msg_info["msg_type"] == "text": if update.message is not None: - update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN]) + update.message.reply_text(response_msg.text[: self.MAX_MSG_LEN]) elif update.callback_query.message is not None: - update.callback_query.message.reply_text(response_msg.text[:self.MAX_MSG_LEN]) - elif response_msg.msg_info['msg_type'] == 'voice': - ogg_file_name = self.params['asg'].text_to_speech(response_msg.text[:self.MAX_MSG_LEN]) - self.updater.bot.send_voice(chat_id=update.message.chat.id, voice=open(ogg_file_name, 'rb')) + update.callback_query.message.reply_text( + response_msg.text[: self.MAX_MSG_LEN] + ) + elif response_msg.msg_info["msg_type"] == "voice": + ogg_file_name = self.params["asg"].text_to_speech( + response_msg.text[: self.MAX_MSG_LEN] + ) + self.updater.bot.send_voice( + chat_id=update.message.chat.id, voice=open(ogg_file_name, "rb") + ) os.remove(ogg_file_name) # removing audio files for privacy reasons. - elif response_msg.msg_info['msg_type'] == 'options': - keyboard = [[InlineKeyboardButton(option_text[:self.MAX_OPTION_LEN], - callback_data=urllib.parse.unquote(option_data))] - for (option_text, option_data, output_score) in response_msg.msg_info['options']] + elif response_msg.msg_info["msg_type"] == "options": + keyboard = [ + [ + InlineKeyboardButton( + option_text[: self.MAX_OPTION_LEN], + callback_data=urllib.parse.unquote(option_data), + ) + ] + for ( + option_text, + option_data, + output_score, + ) in response_msg.msg_info["options"] + ] reply_markup = InlineKeyboardMarkup(keyboard) - update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN], reply_markup=reply_markup) - elif response_msg.msg_info['msg_type'] == 'error': - error_msg = 'ERROR: NO RESULT!' + update.message.reply_text( + response_msg.text[: self.MAX_MSG_LEN], reply_markup=reply_markup + ) + elif response_msg.msg_info["msg_type"] == "error": + error_msg = "ERROR: NO RESULT!" if update.message is not None: update.message.reply_text(error_msg) elif update.callback_query.message is not None: update.callback_query.message.reply_text(error_msg) else: - raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type']) + raise Exception( + "The msg_type is not recognized:", response_msg.msg_info["msg_type"] + ) except Exception: traceback.print_exc() @@ -173,10 +222,9 @@ def send_msg(self, chat_id, msg_text): def run(self): """Starting the bot!""" - self.logger.info('Running the Telegram bot!') + self.logger.info("Running the Telegram bot!") self.updater.start_polling() # Run the bot until you press Ctrl-C or the process receives SIGINT, # SIGTERM or SIGABRT. This should be used most of the time, since # start_polling() is non-blocking and will stop the bot gracefully. self.updater.idle() - diff --git a/macaw/live_main.py b/macaw/live_main.py index 29d1602..878f296 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -7,6 +7,7 @@ from macaw.cis import CIS from macaw.core import mrc, retrieval from macaw.core.input_handler.action_detection import RequestDispatcher +from macaw.core.interaction_handler import CurrentAttributes from macaw.core.output_handler import naive_output_selection from macaw.util.logging import Logger @@ -24,11 +25,12 @@ def __init__(self, params): for more information on the required parameters. """ super().__init__(params) - self.logger = params['logger'] - self.logger.info('Conversational QA Model... starting up...') + self.logger = params["logger"] + self.logger.info("Conversational QA Model... starting up...") + self.params["current_attributes"] = CurrentAttributes() self.retrieval = retrieval.get_retrieval_model(params=self.params) self.qa = mrc.get_mrc_model(params=self.params) - self.params['actions'] = {'retrieval': self.retrieval, 'qa': self.qa} + self.params["actions"] = {"retrieval": self.retrieval, "qa": self.qa} self.request_dispatcher = RequestDispatcher(self.params) self.output_selection = naive_output_selection.NaiveOutputProcessing({}) @@ -51,40 +53,47 @@ def request_handler_func(self, conv_list): def run(self): """ - This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. + This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. """ self.interface.run() -if __name__ == '__main__': - basic_params = {'timeout': 15, # timeout is in terms of second. - 'mode': 'live', # mode can be either live or exp. - 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class. +if __name__ == "__main__": + basic_params = { + "timeout": 15, # timeout is in terms of second. + "mode": "live", # mode can be either live or exp. + "logger": Logger({}), + } # for logging into file, pass the filepath to the Logger class. # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the # database, as well as the database name. - db_params = {'interaction_db_host': 'localhost', - 'interaction_db_port': 27017, - 'interaction_db_name': 'macaw_test'} + db_params = { + "interaction_db_host": "localhost", + "interaction_db_port": 27017, + "interaction_db_name": "macaw_test", + } # These are interface parameters. They are interface specific. - interface_params = {'interface': 'stdio', # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' - # for experimental mode. - 'bot_token': 'YOUR_TELEGRAM_BOT_TOKEN', # Telegram bot token. - # 'asr_model': 'google', # The API used for speech recognition. - # 'asg_model': 'google', # The API used for speech generation. - 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE' - } + interface_params = { + "interface": "stdio", # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' + # for experimental mode. + "bot_token": "YOUR_TELEGRAM_BOT_TOKEN", # Telegram bot token. + # 'asr_model': 'google', # The API used for speech recognition. + # 'asg_model': 'google', # The API used for speech generation. + "google-speech-to-text-credential-file": "YOUR_GOOGLE_CREDENTIAL_FILE", + } # These are parameters used by the retrieval model. - retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history. - 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False. - 'search_engine': 'tantivy', # the search engine. - 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key - 'search_engine_path': 'tantivy_index/', # The path to the tantivy index. - 'col_index': '/usr/src/app/indri-5.11/buildindex/my_index', # The path to the indri index. - 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported. - 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine. + retrieval_params = { + "query_generation": "simple", # the model that generates a query from a conversation history. + "use_coref": True, # True, if query generator can use coreference resolution, otherwise False. + "search_engine": "tantivy", # the search engine. + "bing_key": "YOUR_BING_SUBSCRIPTION_KEY", # Bing API key + "search_engine_path": "tantivy_index/", # The path to the tantivy index. + "col_index": "/usr/src/app/indri-5.11/buildindex/my_index", # The path to the indri index. + "col_text_format": "trectext", # collection text format. Standard 'trectext' is only supported. + "results_requested": 3, + } # Maximum number of docs that should be retrieved by search engine. # Note: If you want to have a re-ranking model (e.g., learning to rank), you just need to simply extend the class # core.retrieval.search_engine.ReRanker and implement the method 'rerank'. Then simply add a 'reranker' parameter to # retrieval_params that points to an instance of your favorite ReRanker class. If there is a 'reranker' parameter in @@ -92,12 +101,20 @@ def run(self): # 'get_results' in class core.retrieval.search_engine.Retrieval. # These are parameters used by the machine reading comprehension model. - mrc_params = {'mrc': 'drqa', # MRC model. - 'mrc_model_path': '/usr/src/app/DrQA/data/reader/multitask.mdl', # The path to the model parameters. - 'mrc_path': '/usr/src/app/DrQA', # The path to the model toolkit. - 'corenlp_path': '/usr/src/app/stanford-corenlp-full-2017-06-09', # The path to the corenlp toolkit. - 'qa_results_requested': 3} # The number of candidate answers returned by the MRC model. - - params = {**basic_params, **db_params, **interface_params, **retrieval_params, **mrc_params} - basic_params['logger'].info(params) + mrc_params = { + "mrc": "drqa", # MRC model. + "mrc_model_path": "/usr/src/app/DrQA/data/reader/multitask.mdl", # The path to the model parameters. + "mrc_path": "/usr/src/app/DrQA", # The path to the model toolkit. + "corenlp_path": "/usr/src/app/stanford-corenlp-full-2017-06-09", # The path to the corenlp toolkit. + "qa_results_requested": 3, + } # The number of candidate answers returned by the MRC model. + + params = { + **basic_params, + **db_params, + **interface_params, + **retrieval_params, + **mrc_params, + } + basic_params["logger"].info(params) ConvQA(params).run() diff --git a/macaw/util/__init__.py b/macaw/util/__init__.py index 5e6e731..d3c494b 100644 --- a/macaw/util/__init__.py +++ b/macaw/util/__init__.py @@ -5,21 +5,10 @@ """ import json -import time from stanfordcorenlp import StanfordCoreNLP -def current_time_in_milliseconds(): - """ - A method that returns the current time in milliseconds. - - Returns: - An int representing the current time in milliseconds. - """ - return int(round(time.time() * 1000)) - - class NLPUtil: def __init__(self, params): """ @@ -29,11 +18,15 @@ def __init__(self, params): params(dict): A dict containing some parameters. """ self.params = params - self.corenlp = StanfordCoreNLP(self.params['corenlp_path'], quiet=False) + self.corenlp = StanfordCoreNLP(self.params["corenlp_path"], quiet=False) # Pre-fetching the required models. - props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False} - self.corenlp.annotate('', properties=props) + props = { + "annotators": "coref", + "pipelineLanguage": "en", + "ner.useSUTime": False, + } + self.corenlp.annotate("", properties=props) def get_coref(self, text): """ @@ -44,7 +37,11 @@ def get_coref(self, text): Returns: A json object containing all co-reference resolutions extracted from the input text. """ - props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False} + props = { + "annotators": "coref", + "pipelineLanguage": "en", + "ner.useSUTime": False, + } result = json.loads(self.corenlp.annotate(text, properties=props)) return result diff --git a/macaw/util/text_parser.py b/macaw/util/text_parser.py index 8493bc3..5ae3205 100644 --- a/macaw/util/text_parser.py +++ b/macaw/util/text_parser.py @@ -4,9 +4,10 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ -import justext from xml.etree import cElementTree as ElementTree +import justext + class XmlListConfig(list): def __init__(self, aList): @@ -88,6 +89,7 @@ def xml_file_to_dict(xml_file): # result = filter(visible, data) # return ' '.join(result) + def html_to_clean_text(html): """ Converting an HTML document to clean text. @@ -102,4 +104,4 @@ def html_to_clean_text(html): for paragraph in paragraphs: if not paragraph.is_boilerplate: clean_text_list.append(paragraph.text) - return '\n'.join(clean_text_list) + return "\n".join(clean_text_list) diff --git a/requirements.txt b/requirements.txt index 5c4c74a..bf68383 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ python-telegram-bot==12.0.0 stanfordcorenlp google-cloud-texttospeech tantivy +pymongo diff --git a/schema.ts b/schema.ts new file mode 100644 index 0000000..0e1c16f --- /dev/null +++ b/schema.ts @@ -0,0 +1,21 @@ +interface Message { + user_interface: string, + user_id: string | number, // number as in integer + text: string, + timestamp: Date, // datetime.utcnow() + user_info?: {[user_detail: string]: any}, + msg_info?: {[msg_detail: string]: any}, + actions: { + [action: string]: string | {[action_var: string]: any} + }, + dialog_state_tracking: { + state: string, + [dialog_state_tracking_var: string]: string + }, + nlp_pipeline: { + [module: string]: {[module_var: string]: any} + } + user_attributes: { + [attribute: string]: any + } +} \ No newline at end of file From 017e6a691aa3e5662a07b44cfe34a1e1897c7c7d Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Tue, 5 Jul 2022 10:31:01 -0700 Subject: [PATCH 08/10] Feature/dialogue manager (#7) Integrate Dialogue Manager. List of all changes: 1. Added dialogue manager with DST. 2. Added nlp pipeline class. 3. Encoding and decoding of Message for storing and retrieving from DB. 4. Enabling FileIO mode. 5. Updated readme file. -------------- * Added vscode to gitignore * Added preliminary implementation of state manager * Deleted redundant code * Changed timestamp to use datetime.utcnow * Added CurrentAttributes to init * Added CurrentAttributes as a shared parameter * Refactored Message.from_dict * Reformatted and sorted imports * Made user_interface, user_id, text, and timestamp mandatory and added more properties * Removed injector dependency * Moved attributes to separate file * Added nlp_pipeline and user_attributes to Message and different serialization * Schema for the Messages persisted to MongoDB * Initial commit. Added input and output from a file for quick testing. * cr fix. * Running exp mode using command line arguments. Updated Readme file too. * minor fix. * Integrate Dialogue Manager. List of all changes: 1. Added dialogue manager with DST. 2. Added nlp pipeline class. 3. Encoding and decoding of Message for storing and retrieving from DB. 4. Enabling FileIO mode. 5. Updated readme file. * minor changes. * resolving PR comments. * minor. Co-authored-by: George Wei Co-authored-by: Amit Hattimare --- README.md | 28 +++++-- data/file_input.txt | 4 + macaw/cis.py | 73 +++++++++---------- macaw/core/dialogue_manager/__init__.py | 0 .../core/dialogue_manager/dialogue_manager.py | 28 +++++++ macaw/core/dialogue_manager/dst.py | 42 +++++++++++ macaw/core/interaction_handler/msg.py | 60 +++++++++------ .../interaction_handler/user_requests_db.py | 8 +- macaw/core/nlp_pipeline/__init__.py | 0 macaw/core/nlp_pipeline/nlp_pipeline.py | 19 +++++ .../output_handler/naive_output_selection.py | 29 ++++---- macaw/interface/fileio.py | 34 ++++----- macaw/interface/interface.py | 4 +- macaw/interface/stdio.py | 8 +- macaw/live_main.py | 51 +++++++++---- scripts/start.sh | 2 +- 16 files changed, 269 insertions(+), 121 deletions(-) create mode 100644 data/file_input.txt create mode 100644 macaw/core/dialogue_manager/__init__.py create mode 100644 macaw/core/dialogue_manager/dialogue_manager.py create mode 100644 macaw/core/dialogue_manager/dst.py create mode 100644 macaw/core/nlp_pipeline/__init__.py create mode 100644 macaw/core/nlp_pipeline/nlp_pipeline.py diff --git a/README.md b/README.md index 616bff9..aa6bf49 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ run using other settings, appropriate changes should be done. The first step is to install [Docker](https://docs.docker.com/engine/install/) in your system. Then continue with the below steps. -#### Create the build +### Create the build To reduce the size of the build, we can keep certain data outside the Docker container and mount it using [volumes](https://docs.docker.com/storage/volumes/). @@ -97,7 +97,7 @@ docker build --build-arg download_stanford_corenlp=true --build-arg download_drq _Note: To make sure that the Docker container builds without modification, an x86_64/amd64 system is required. If you have an arm64 device, then add the flag `--platform linux/amd64` to the build command._ -#### Run the application +### Run the application If you downloaded certain data locally, then use Docker volumes to mount local directory to Docker container. You need to provide the local directory path during runtime. Run the command from project root. @@ -124,10 +124,28 @@ and remove the container when the application exits (`--rm`). After installing a runs `scripts/start.sh` which first starts MongoDB server in a separate thread and then runs `live_main.py`. -_Note: Similarly to requiring to requiring the additional flag of `--platform linux/amd64` to build the Docker container with an arm64 machine, running said container also requires the same flag. +_Note: Similar to requiring the additional flag of `--platform linux/amd64` to build the Docker container with an arm64 machine, running said container also requires the same flag. :warning: **The performance of the container under this emulation will be incredibly poor. If possible, use a x86_64/amd64 system**._ -#### ssh into the container +#### Run with file input + +To avoid typing the input every time, you can provide an input file and get output in an output file. We need to mount +the directory containing the data. + +```commandline +docker build -t macaw . && docker run --rm -i --name=macaw_test_container \ +-v :/usr/src/app/DrQA/data \ +-v $("pwd")/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09 \ +-v $("pwd")/data:/usr/src/app/data \ +macaw +``` + +Also update the command inside `scripts/start.sh` file to +```commandline +python3 macaw/live_main.py --mode exp --interface fileio +``` + +### ssh into the container While the application is running, we can go inside the container to see the contents (directory structure, Tantivy index, etc.). @@ -136,7 +154,7 @@ etc.). docker exec -it macaw_test_container /bin/bash ``` -#### Updating TREC data for Tantivy +### Updating TREC data for Tantivy Tantivy index is created using the document stored in `trec_documents/` directory. It has some default data. To create a bigger index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents. diff --git a/data/file_input.txt b/data/file_input.txt new file mode 100644 index 0000000..ea79eaa --- /dev/null +++ b/data/file_input.txt @@ -0,0 +1,4 @@ +q1 who is the president of japan? +q2 when did the gulf war start? +q3 what rights did the kingdom restore with israel? +q4 when did jordan regained sovereignty over its territories? \ No newline at end of file diff --git a/macaw/cis.py b/macaw/cis.py index 6952ba8..67d634e 100644 --- a/macaw/cis.py +++ b/macaw/cis.py @@ -3,16 +3,20 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ - from abc import ABC, abstractmethod from datetime import datetime +from typing import List from func_timeout import FunctionTimedOut from macaw import interface, util +from macaw.core.dialogue_manager.dialogue_manager import DialogManager +from macaw.core.input_handler.action_detection import RequestDispatcher from macaw.core.interaction_handler import CurrentAttributes from macaw.core.interaction_handler.msg import Message from macaw.core.interaction_handler.user_requests_db import InteractionDB +from macaw.core.nlp_pipeline.nlp_pipeline import NlpPipeline +from macaw.core.output_handler import naive_output_selection class CIS(ABC): @@ -27,16 +31,23 @@ def __init__(self, params): self.params = params if params["mode"] == "live": self.params["live_request_handler"] = self.live_request_handler - self.msg_db = InteractionDB( - host=self.params["interaction_db_host"], - port=self.params["interaction_db_port"], - dbname=self.params["interaction_db_name"], - ) - self.params["curr_attrs"] = self.curr_attrs = CurrentAttributes() elif params["mode"] == "exp": - self.params["experimental_request_handler"] = self.request_handler_func + self.params["experimental_request_handler"] = self.file_request_handler + + self.msg_db = InteractionDB( + host=self.params["interaction_db_host"], + port=self.params["interaction_db_port"], + dbname=self.params["interaction_db_name"], + ) + self.params["curr_attrs"] = self.curr_attrs = CurrentAttributes() + self.params["actions"] = self.generate_actions() self.interface = interface.get_interface(params) + self.request_dispatcher = RequestDispatcher(self.params) + self.output_selection = naive_output_selection.NaiveOutputProcessing({}) + self.dialogue_manager = DialogManager() + self.nlp_pipeline = NlpPipeline(params.get("nlp_modules", {})) + try: self.nlp_util = util.NLPUtil(self.params) self.params["nlp_util"] = self.nlp_util @@ -47,16 +58,19 @@ def __init__(self, params): ) self.timeout = self.params["timeout"] if "timeout" in self.params else -1 - def live_request_handler(self, msg): + def live_request_handler(self, msg: Message): try: - # load conversation from the database and add the current message to the database - conv = [msg] + self.msg_db.get_conv_history( + # load conversation from the database. + history = self.msg_db.get_conv_history( user_id=msg.user_id, max_time=10*60*1000, max_count=10 ) - self.msg_db.insert_one(msg) + + conv = [msg] + history # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv]) output_msg = self.request_handler_func(conv) + + # Save the output and conversation state in DB. self.msg_db.insert_one(output_msg) return output_msg @@ -79,37 +93,18 @@ def live_request_handler(self, msg): self.msg_db.insert_one(error_msg) return error_msg - # def experimental_request_handler(self, str_list): - # if not isinstance(str_list, list): - # raise Exception('The input should be a list!') - # - # conv_list = [] - # for i in range(len(str_list)): - # if not isinstance(str_list[i], str): - # raise Exception('Each element of the input should be a string!') - # user_info = {'first_name': 'NONE'} - # msg_info = {'msg_id': -1, - # 'msg_type': 'command' if str_list[i].startswith('#') else 'text', - # 'msg_source': 'user'} - # msg = Message(user_interface='NONE', - # user_id=-1, - # user_info=user_info, - # msg_info=msg_info, - # text=str_list[i], - # timestamp=datetime.utcnow()) - # conv_list.append(msg) - # conv_list.reverse() - # - # if self.timeout > 0: - # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv_list]) - # else: - # output_msg = self.request_handler_func(conv_list) - # return output_msg + def file_request_handler(self, msg: Message): + # Just delegate the call to the live request handler. + return self.live_request_handler(msg) @abstractmethod - def request_handler_func(self, conv_list): + def request_handler_func(self, conv_list: List[Message]) -> Message: pass @abstractmethod def run(self): pass + + @abstractmethod + def generate_actions(self) -> dict: + pass diff --git a/macaw/core/dialogue_manager/__init__.py b/macaw/core/dialogue_manager/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/macaw/core/dialogue_manager/dialogue_manager.py b/macaw/core/dialogue_manager/dialogue_manager.py new file mode 100644 index 0000000..9c5d250 --- /dev/null +++ b/macaw/core/dialogue_manager/dialogue_manager.py @@ -0,0 +1,28 @@ +from core.dialogue_manager.dst import DST, State + + +class DialogManager(object): + def __init__(self, dst=None): + self.dst = DST() if dst is None else dst + + @classmethod + def decode(cls, encoded: dict): + return DialogManager(DST.decode(encoded["dst"])) + + def process_turn(self, nlp_pipeline): + # Get these from the nlp_pipeline output. + input_alphabet = None + context = None + + new_state = self.dst.transition(input_alphabet, context) + if isinstance(new_state, State): + self.dst.update(new_state) + else: + # Save output result in conversation context. + print(f"new_state={new_state}") + pass + + def encode(self) -> dict: + return { + "dst": self.dst.encode() + } diff --git a/macaw/core/dialogue_manager/dst.py b/macaw/core/dialogue_manager/dst.py new file mode 100644 index 0000000..142c60f --- /dev/null +++ b/macaw/core/dialogue_manager/dst.py @@ -0,0 +1,42 @@ +import enum + +from typing import Union + + +class State(enum.Enum): + launch = 1 + recipe_selection = 2 + ingredients = 3 + show_steps = 4 + goodbye = 5 + + +class DST: + def __init__(self, state=None): + self.curr_state = State.launch if state is None else state + + @classmethod + def decode(cls, encoded: dict): + return DST(State(encoded["curr_state"])) + + def transition(self, input_alphabet, context) -> Union[str, State]: + """ + Given the input alphabet and context (and thing that tells about the ongoing conversation), it returns what + should the destination state be. Returns message str if there is no valid transition. + """ + if self.curr_state == State.launch: + if input_alphabet == 'recipe_question': + return State.recipe_selection + elif self.curr_state == State.recipe_selection: + if input_alphabet == 'option_selection': + return State.ingredients + return f"Could not make a transition from {self.curr_state} using input {input_alphabet}." + + def update(self, new_state: State): + self.curr_state = new_state + + def encode(self) -> dict: + return { + "curr_state": self.curr_state.value + } + diff --git a/macaw/core/interaction_handler/msg.py b/macaw/core/interaction_handler/msg.py index 21ae98b..cb851c9 100644 --- a/macaw/core/interaction_handler/msg.py +++ b/macaw/core/interaction_handler/msg.py @@ -1,5 +1,5 @@ """ -The message used to represent each interaction in Macaw. +The message used to represent each interaction in Macaw. It could be either from the user side or from the bot side. Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ @@ -7,21 +7,23 @@ from typing import Optional, Union, Dict from .attributes import UserAttributes +from ..dialogue_manager.dialogue_manager import DialogManager class Message: def __init__( - self, - user_interface: str, - user_id: Union[str, int], - text: str, - timestamp: datetime, - user_info: Optional[Dict[str, any]] = None, - msg_info: Optional[Dict[str, any]] = None, - actions: Optional[Dict[str, any]] = None, - dialog_state_tracking: Optional[Dict[str, any]] = None, - nlp_pipeline: Optional[Dict[str, any]] = None, - user_attributes: Optional[Dict[str, any]] = None, + self, + user_interface: str, + user_id: Union[str, int], + text: str, + timestamp: datetime, + response: Optional[str] = None, + user_info: Optional[Dict[str, any]] = None, + msg_info: Optional[Dict[str, any]] = None, + actions_result: Optional[Dict[str, any]] = None, + dialog_manager: Optional[DialogManager] = None, + nlp_pipeline_result: Optional[Dict[str, any]] = None, + user_attributes: Optional[Dict[str, any]] = None, ): """ An object for input and output Message. @@ -29,24 +31,26 @@ def __init__( Args: user_interface(str): The interface name used for this message (e.g., 'telegram') user_id(str | int): The user ID. - text(str): The message text. + text(str): The user message text. + response(str): (Optional) The bot message text. None indicates bot response hasn't been generated yet. timestamp(datetime.datetime): The timestamp of message. user_info(dict): (Optional) The dict containing some more information about the user. msg_info(dict): (Optional) The dict containing some more information about the message. - actions(dict): (Optional) The results from the various actions given the conversation history. - dialog_state_tracking(dict): (Optional) The dialog state tracking dict. - nlp_pipeline(dict): (Optional) The results from running the NLP pipeline. + actions_result(dict): (Optional) The results from the various actions given the conversation history. + dialog_manager(DialogManager): (Optional) The dialog manager. + nlp_pipeline_result(dict): (Optional) The results from running the NLP pipeline. user_attributes(dict): (Optional) The user attributes for this given message. """ self.user_interface = user_interface self.user_id = user_id self.text = text + self.response = response self.timestamp = timestamp self.user_info = user_info self.msg_info = msg_info - self.actions = actions - self.dialog_state_tracking = dialog_state_tracking - self.nlp_pipeline = nlp_pipeline + self.actions_result = actions_result + self.dialog_manager = dialog_manager + self.nlp_pipeline_result = nlp_pipeline_result if user_attributes is None: user_attributes = dict() @@ -56,17 +60,27 @@ def __init__( @classmethod def from_dict(cls, msg_dict): """ - Get a Message object from dict. + Get a Message object from serialized dict obtained from database (e.g. MongoDB). Args: msg_dict(dict): A dict containing all the information required to construct a Message object. Returns: A Message object. """ + # Deserialize custom variables of Message class. + msg_dict["dialog_manager"] = DialogManager.decode(msg_dict["dialog_manager"]) + return cls(**msg_dict) def __iter__(self): + """ + This iterable is needed so that Message object can be inserted into a MongoDB table as a dictionary. It should + provide encoding logic for all custom data types used inside Message class. + """ for attr, value in self.__dict__.items(): - yield attr, value if not isinstance( - value, UserAttributes - ) else value.__dict__ # If this doesn't work, default to `None` + value_ret = value + if isinstance(value, UserAttributes): + value_ret = value.__dict__ + elif isinstance(value, DialogManager): + value_ret = value.encode() + yield attr, value_ret diff --git a/macaw/core/interaction_handler/user_requests_db.py b/macaw/core/interaction_handler/user_requests_db.py index 11ae1a0..dfab257 100644 --- a/macaw/core/interaction_handler/user_requests_db.py +++ b/macaw/core/interaction_handler/user_requests_db.py @@ -5,6 +5,7 @@ """ from datetime import datetime, timedelta +from typing import List from pymongo import MongoClient @@ -31,7 +32,7 @@ def get_all(self): ) return self.dict_list_to_msg_list(self.col.find({})) - def get_conv_history(self, user_id, max_time, max_count): + def get_conv_history(self, user_id, max_time, max_count) -> List[Message]: if max_time is None: res = self.col.find({"user_id": user_id}, sort=[("timestamp", -1)]) else: @@ -53,9 +54,10 @@ def close(self): self.client.close() @staticmethod - def dict_list_to_msg_list(msg_dict_list): + def dict_list_to_msg_list(msg_dict_list) -> List[Message]: msg_list = [] for msg_dict in msg_dict_list: msg_dict.pop("_id") - msg_list.append(Message.from_dict(msg_dict=msg_dict)) + message = Message.from_dict(msg_dict=msg_dict) + msg_list.append(message) return msg_list diff --git a/macaw/core/nlp_pipeline/__init__.py b/macaw/core/nlp_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/macaw/core/nlp_pipeline/nlp_pipeline.py b/macaw/core/nlp_pipeline/nlp_pipeline.py new file mode 100644 index 0000000..633d9b7 --- /dev/null +++ b/macaw/core/nlp_pipeline/nlp_pipeline.py @@ -0,0 +1,19 @@ +from typing import List + +from core.interaction_handler import Message + + +class NlpPipeline: + def __init__(self, modules: dict): + self.modules = dict() + self.modules.update(modules) + self.context = dict() + + def run(self, conv_list: List[Message]): + for model_name, model in self.modules.items(): + # TODO: pass in required input to every model. + conv = None # get from state_manager + model_output = model.run(conv) + + # TODO: Save output in state_manager. + pass diff --git a/macaw/core/output_handler/naive_output_selection.py b/macaw/core/output_handler/naive_output_selection.py index 1215b83..33fe450 100644 --- a/macaw/core/output_handler/naive_output_selection.py +++ b/macaw/core/output_handler/naive_output_selection.py @@ -53,7 +53,7 @@ def output_selection(self, conv_list, candidate_outputs): return "retrieval" return None - def get_output(self, conv, candidate_outputs): + def get_output(self, conv_list, candidate_outputs): """ The response Message generation method. @@ -67,27 +67,27 @@ def get_output(self, conv, candidate_outputs): Returns: A response Message to be sent to the user. """ - user_id = conv[0].user_id - user_info = conv[0].user_info + user_id = conv_list[0].user_id + user_info = conv_list[0].user_info msg_info = dict() - msg_info["msg_id"] = conv[0].msg_info["msg_id"] + msg_info["msg_id"] = conv_list[0].msg_info["msg_id"] msg_info["msg_source"] = "system" - text = "" - user_interface = conv[0].user_interface + response = "" + user_interface = conv_list[0].user_interface - selected_action = self.output_selection(conv, candidate_outputs) + selected_action = self.output_selection(conv_list, candidate_outputs) if selected_action is None: msg_info["msg_type"] = "text" msg_info["msg_creator"] = "no answer error" - text = "No response has been found! Please try again!" + response = "No response has been found! Please try again!" elif selected_action == "qa": - msg_info["msg_type"] = conv[0].msg_info["msg_type"] + msg_info["msg_type"] = conv_list[0].msg_info["msg_type"] msg_info["msg_creator"] = "qa" - text = candidate_outputs["qa"][0].text + response = candidate_outputs["qa"][0].text elif selected_action == "retrieval": msg_info["msg_type"] = "options" msg_info["msg_creator"] = "retrieval" - text = "Retrieved document list (click to see the document content):" + response = "Retrieved document list (click to see the document content):" msg_info["options"] = [ (output.title, "#get_doc " + output.id, output.score) for output in candidate_outputs["retrieval"] @@ -95,17 +95,18 @@ def get_output(self, conv, candidate_outputs): elif selected_action == "#get_doc": msg_info["msg_type"] = "text" msg_info["msg_creator"] = "#get_doc" - text = candidate_outputs["#get_doc"][0].text + response = candidate_outputs["#get_doc"][0].text else: raise Exception("The candidate output key is not familiar!") timestamp = datetime.utcnow() - if timestamp <= conv[0].timestamp: + if timestamp <= conv_list[0].timestamp: raise Exception("There is a problem in the output timestamp!") return Message( user_interface=user_interface, user_id=user_id, user_info=user_info, msg_info=msg_info, - text=text, + text=conv_list[0].text, + response=response, timestamp=timestamp ) diff --git a/macaw/interface/fileio.py b/macaw/interface/fileio.py index bed01f6..5ef4c53 100644 --- a/macaw/interface/fileio.py +++ b/macaw/interface/fileio.py @@ -5,6 +5,7 @@ """ import time +from datetime import datetime from macaw.core.interaction_handler.msg import Message from macaw.interface.interface import Interface @@ -20,27 +21,24 @@ def run(self): with open(self.params["input_file_path"]) as input_file: for line in input_file: str_list = line.strip().split("\t") - if len(str_list) < 2: + if len(str_list) != 2: raise Exception( - "Each input line should contain at least 2 elements: a query ID and a query text." + f"Each input line should contain at least 2 elements: a query ID and a query text." + f"Invalid line: {line}" ) qid = str_list[0] - conv_list = [] - for i in range(1, len(str_list)): - user_info = {"first_name": "NONE"} - msg_info = {"msg_id": qid, "msg_type": "text", "msg_source": "user"} - msg = Message( - user_interface="NONE", - user_id=-1, - user_info=user_info, - msg_info=msg_info, - text=str_list[i], - timestamp=-1, - ) - conv_list.append(msg) - conv_list.reverse() - output_msg = self.params["experimental_request_handler"](conv_list) + user_info = {"first_name": "NONE"} + msg_info = {"msg_id": qid, "msg_type": "text", "msg_source": "user"} + msg = Message( + user_interface="fileio", + user_id=-1, + user_info=user_info, + msg_info=msg_info, + text=str_list[1], + timestamp=datetime.utcnow(), + ) + output_msg = self.params["experimental_request_handler"](msg) self.result_presentation( output_msg, {"output_file": output_file, "qid": qid} ) @@ -74,7 +72,7 @@ def result_presentation(self, output_msg, params): output_file.write( qid + "\t" - + output_msg.text.replace("\n", " ").replace("\t", " ") + + output_msg.response.replace("\n", " ").replace("\t", " ") + "\n" ) else: diff --git a/macaw/interface/interface.py b/macaw/interface/interface.py index e82ec8a..f867bc4 100644 --- a/macaw/interface/interface.py +++ b/macaw/interface/interface.py @@ -6,6 +6,8 @@ from abc import ABC, abstractmethod +from core.interaction_handler import Message + class Interface(ABC): def __init__(self, params): @@ -16,5 +18,5 @@ def run(self): pass @abstractmethod - def result_presentation(self, response_msg, params): + def result_presentation(self, response_msg: Message, params: dict): pass diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py index ff5844e..cc91eb6 100644 --- a/macaw/interface/stdio.py +++ b/macaw/interface/stdio.py @@ -35,24 +35,24 @@ def run(self): msg = Message( user_interface="stdio", user_id=-1, - user_info=user_info, - msg_info=msg_info, text=request, timestamp=datetime.utcnow(), + user_info=user_info, + msg_info=msg_info, ) output = self.params["live_request_handler"](msg) self.result_presentation(output, {}) except Exception as ex: traceback.print_exc() - def result_presentation(self, response_msg, params): + def result_presentation(self, response_msg: Message, params: dict): try: print("THE RESPONSE STARTS") print( "----------------------------------------------------------------------" ) if response_msg.msg_info["msg_type"] == "text": - print(response_msg.text) + print(response_msg.response) elif response_msg.msg_info["msg_type"] == "options": for (option_text, option_data, output_score) in response_msg.msg_info[ "options" diff --git a/macaw/live_main.py b/macaw/live_main.py index 878f296..628f6ad 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -3,12 +3,12 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ +import argparse +from typing import List +from core.interaction_handler import Message from macaw.cis import CIS from macaw.core import mrc, retrieval -from macaw.core.input_handler.action_detection import RequestDispatcher -from macaw.core.interaction_handler import CurrentAttributes -from macaw.core.output_handler import naive_output_selection from macaw.util.logging import Logger @@ -27,14 +27,14 @@ def __init__(self, params): super().__init__(params) self.logger = params["logger"] self.logger.info("Conversational QA Model... starting up...") - self.params["current_attributes"] = CurrentAttributes() - self.retrieval = retrieval.get_retrieval_model(params=self.params) - self.qa = mrc.get_mrc_model(params=self.params) - self.params["actions"] = {"retrieval": self.retrieval, "qa": self.qa} - self.request_dispatcher = RequestDispatcher(self.params) - self.output_selection = naive_output_selection.NaiveOutputProcessing({}) - - def request_handler_func(self, conv_list): + + def generate_actions(self) -> dict: + return { + "retrieval": retrieval.get_retrieval_model(params=self.params), + "qa": mrc.get_mrc_model(params=self.params) + } + + def request_handler_func(self, conv_list: List[Message]) -> Message: """ This function is called for each conversational interaction made by the user. In fact, this function calls the dispatcher to send the user request to the information seeking components. @@ -47,8 +47,23 @@ def request_handler_func(self, conv_list): output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user. """ self.logger.info(conv_list) + + self.nlp_pipeline.run(conv_list) + + # Run the DST module here. Save the output locally. + nlp_pipeline_output = None + self.dialogue_manager.process_turn(nlp_pipeline_output) + + # TODO: request dispatcher should also use DST output. dispatcher_output = self.request_dispatcher.dispatch(conv_list) + output_msg = self.output_selection.get_output(conv_list, dispatcher_output) + + # Save nlp_pipeline result, action result and dialogue_manager so that they are persisted in DB. + output_msg.nlp_pipeline_result = nlp_pipeline_output + output_msg.actions_result = None + output_msg.dialog_manager = self.dialogue_manager + return output_msg def run(self): @@ -59,9 +74,16 @@ def run(self): if __name__ == "__main__": + # Parse input arguments. + parser = argparse.ArgumentParser(description="Run live_main.py file") + parser.add_argument("--mode", type=str, default="live", help="live or exp (experimental)") + parser.add_argument("--interface", type=str, default="stdio", help="can be 'telegram' or 'stdio' for live mode, and" + " 'fileio' for exp mode") + args = parser.parse_args() + basic_params = { "timeout": 15, # timeout is in terms of second. - "mode": "live", # mode can be either live or exp. + "mode": args.mode, # mode can be either live or exp. "logger": Logger({}), } # for logging into file, pass the filepath to the Logger class. @@ -75,8 +97,11 @@ def run(self): # These are interface parameters. They are interface specific. interface_params = { - "interface": "stdio", # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' + "interface": args.interface, # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' # for experimental mode. + "input_file_path": "/usr/src/app/data/file_input.txt", + "output_file_path": "/usr/src/app/data/file_output.txt", + "output_format": "text", "bot_token": "YOUR_TELEGRAM_BOT_TOKEN", # Telegram bot token. # 'asr_model': 'google', # The API used for speech recognition. # 'asg_model': 'google', # The API used for speech generation. diff --git a/scripts/start.sh b/scripts/start.sh index f1034dc..9f2b727 100644 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -9,4 +9,4 @@ echo "MongoDB started successfully." echo "This is the python version: $(python3 --version)" # Start the main application. -python3 macaw/live_main.py +python3 macaw/live_main.py --mode live --interface stdio From d824ed17e5360eb518a98911be9d4e806893ba85 Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Fri, 8 Jul 2022 13:01:29 -0700 Subject: [PATCH 09/10] Feature/nlp pipeline (#8) Changes made: 1. Added support for remote NLP modules running as separate Docker containers. 2. Added support for remote and local Response generator modules. Added a local Punctuation RG using Hugging for proof-of-concept. 3. Integrated docker-compose to start the entire application. 4. Updated Readme file. 5. Created a separate global logger without the need to pass it as a variable. 6. Added multiprocessing for NLP pipeline modules run. 7. (minor) Using exponential backoff for NLPUtil as it often fails during initialization. This will be replaced with RG/NLP pipelines later. Co-authored-by: Amit Hattimare --- README.md | 34 ++++++++++ data/file_input.txt | 2 +- docker-compose.yml | 26 ++++++++ macaw/batch_exp_main.py | 2 +- macaw/cis.py | 18 +++-- .../core/dialogue_manager/dialogue_manager.py | 7 +- macaw/core/mrc/__init__.py | 4 +- macaw/core/nlp_pipeline/nlp_pipeline.py | 65 ++++++++++++++++--- .../{input_handler => response}/__init__.py | 0 .../action_detection.py | 7 +- .../{input_handler => response}/actions.py | 0 macaw/core/response/docker.py | 30 +++++++++ macaw/core/response/handler.py | 37 +++++++++++ macaw/core/response/punctuation.py | 34 ++++++++++ macaw/core/response/response_generator.py | 14 ++++ macaw/core/retrieval/__init__.py | 13 ++-- macaw/docker/flask_app/Dockerfile | 38 +++++++++++ macaw/docker/flask_app/app/app.py | 64 ++++++++++++++++++ macaw/docker/flask_app/app/remote_module.py | 15 +++++ macaw/docker/flask_app/app/requirements.txt | 11 ++++ macaw/docker/flask_app/config/flask.conf | 9 +++ macaw/docker/flask_app/config/gunicorn.conf | 3 + .../docker/flask_app/config/supervisord.conf | 18 +++++ macaw/docker/ic_app/Dockerfile | 38 +++++++++++ macaw/docker/ic_app/app/app.py | 64 ++++++++++++++++++ macaw/docker/ic_app/app/remote_module.py | 15 +++++ macaw/docker/ic_app/app/requirements.txt | 11 ++++ macaw/docker/ic_app/config/flask.conf | 9 +++ macaw/docker/ic_app/config/gunicorn.conf | 3 + macaw/docker/ic_app/config/supervisord.conf | 18 +++++ macaw/docker/qa_app/Dockerfile | 38 +++++++++++ macaw/docker/qa_app/app/app.py | 64 ++++++++++++++++++ macaw/docker/qa_app/app/remote_module.py | 15 +++++ macaw/docker/qa_app/app/requirements.txt | 15 +++++ macaw/docker/qa_app/config/flask.conf | 9 +++ macaw/docker/qa_app/config/gunicorn.conf | 3 + macaw/docker/qa_app/config/supervisord.conf | 18 +++++ macaw/interface/fileio.py | 5 +- macaw/interface/stdio.py | 1 + macaw/live_main.py | 56 +++++++++++----- macaw/util/__init__.py | 4 ++ macaw/util/custom_logging.py | 28 ++++++++ macaw/util/logging.py | 28 -------- macaw/wizard_of_oz_main.py | 22 +++---- requirements.txt | 2 + scripts/start.sh | 4 +- setup.py | 2 +- 47 files changed, 833 insertions(+), 90 deletions(-) create mode 100644 docker-compose.yml rename macaw/core/{input_handler => response}/__init__.py (100%) rename macaw/core/{input_handler => response}/action_detection.py (95%) rename macaw/core/{input_handler => response}/actions.py (100%) create mode 100644 macaw/core/response/docker.py create mode 100644 macaw/core/response/handler.py create mode 100644 macaw/core/response/punctuation.py create mode 100644 macaw/core/response/response_generator.py create mode 100644 macaw/docker/flask_app/Dockerfile create mode 100644 macaw/docker/flask_app/app/app.py create mode 100644 macaw/docker/flask_app/app/remote_module.py create mode 100644 macaw/docker/flask_app/app/requirements.txt create mode 100644 macaw/docker/flask_app/config/flask.conf create mode 100644 macaw/docker/flask_app/config/gunicorn.conf create mode 100644 macaw/docker/flask_app/config/supervisord.conf create mode 100644 macaw/docker/ic_app/Dockerfile create mode 100644 macaw/docker/ic_app/app/app.py create mode 100644 macaw/docker/ic_app/app/remote_module.py create mode 100644 macaw/docker/ic_app/app/requirements.txt create mode 100644 macaw/docker/ic_app/config/flask.conf create mode 100644 macaw/docker/ic_app/config/gunicorn.conf create mode 100644 macaw/docker/ic_app/config/supervisord.conf create mode 100644 macaw/docker/qa_app/Dockerfile create mode 100644 macaw/docker/qa_app/app/app.py create mode 100644 macaw/docker/qa_app/app/remote_module.py create mode 100644 macaw/docker/qa_app/app/requirements.txt create mode 100644 macaw/docker/qa_app/config/flask.conf create mode 100644 macaw/docker/qa_app/config/gunicorn.conf create mode 100644 macaw/docker/qa_app/config/supervisord.conf create mode 100644 macaw/util/custom_logging.py delete mode 100644 macaw/util/logging.py diff --git a/README.md b/README.md index aa6bf49..add050d 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,40 @@ Tantivy index is created using the document stored in `trec_documents/` director bigger index, download the entire data from [archive](https://archive.org/details/trec-ir) and put it in trec_documents. Docker will copy it during build time and create a new index. +## Running entire Macuna application + +Using `docker compose` we can start the main application and all other supporting docker containers (nlp pipeline +applications and remote modules) at once. This does not work with stdio mode as docker compose does not support +terminal input. Run the below command. + +```commandline +docker compose build && docker compose up +``` + +To run different containers independently or to support terminal input, run the below commands in order. + +First, build the application. + +```commandline +docker compose build +``` + +Second, start all the supporting remote modules. Make sure to explicitly provide port and container names. This can be +found from the `docker-compose.yml` file. + +```commandline +docker compose run --rm -p "127.0.0.1:8001:80" --name nlp-pipeline-app-flask nlp-pipeline-app-flask +docker compose run --rm -p "127.0.0.1:8002:80" --name nlp-pipeline-app-ic nlp-pipeline-app-ic +docker compose run --rm -p "127.0.0.1:8003:80" --name response-generator-app-qa response-generator-app-qa +``` + +Third, run the main application which has stdio (or it can also have fileio). For stdio update the flags in `start.sh` +with `--mode live --interface stdio`. + +```commandline +docker compose run --rm base-app +``` + ## Local Setup To setup the package locally without using Docker, follow the below instructions. diff --git a/data/file_input.txt b/data/file_input.txt index ea79eaa..5a3ab77 100644 --- a/data/file_input.txt +++ b/data/file_input.txt @@ -1,4 +1,4 @@ -q1 who is the president of japan? +q1 who is the president of Japan? q2 when did the gulf war start? q3 what rights did the kingdom restore with israel? q4 when did jordan regained sovereignty over its territories? \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..b4a9c6b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,26 @@ +version: '3.4' +services: + base-app: + build: ./ + ports: + - "127.0.0.1:8000:80" + volumes: + - "/Users/amitgh/PycharmProjects/DrQA/data:/usr/src/app/DrQA/data" + - "/Users/amitgh/PycharmProjects/Maruna/macaw/stanford-corenlp-full-2017-06-09:/usr/src/app/stanford-corenlp-full-2017-06-09" + - "/Users/amitgh/PycharmProjects/Maruna/macaw/data:/usr/src/app/data" + deploy: + resources: + limits: + memory: 5gb + nlp-pipeline-app-flask: + build: ./macaw/docker/flask_app + ports: + - "127.0.0.1:8001:80" + nlp-pipeline-app-ic: + build: ./macaw/docker/ic_app + ports: + - "127.0.0.1:8002:80" + response-generator-app-qa: + build: ./macaw/docker/qa_app + ports: + - "127.0.0.1:8003:80" diff --git a/macaw/batch_exp_main.py b/macaw/batch_exp_main.py index 9e61b96..8cad6b3 100644 --- a/macaw/batch_exp_main.py +++ b/macaw/batch_exp_main.py @@ -6,7 +6,7 @@ from macaw.cis import CIS from macaw.core import mrc, retrieval -from macaw.core.input_handler.action_detection import RequestDispatcher +from macaw.core.response.action_detection import RequestDispatcher from macaw.core.output_handler import naive_output_selection diff --git a/macaw/cis.py b/macaw/cis.py index 67d634e..c2a0f5d 100644 --- a/macaw/cis.py +++ b/macaw/cis.py @@ -1,17 +1,14 @@ -""" -The CIS class. - -Authors: Hamed Zamani (hazamani@microsoft.com) -""" from abc import ABC, abstractmethod from datetime import datetime from typing import List +import logging from func_timeout import FunctionTimedOut +from core.response.handler import ResponseGeneratorHandler from macaw import interface, util from macaw.core.dialogue_manager.dialogue_manager import DialogManager -from macaw.core.input_handler.action_detection import RequestDispatcher +from macaw.core.response.action_detection import RequestDispatcher from macaw.core.interaction_handler import CurrentAttributes from macaw.core.interaction_handler.msg import Message from macaw.core.interaction_handler.user_requests_db import InteractionDB @@ -40,21 +37,22 @@ def __init__(self, params): dbname=self.params["interaction_db_name"], ) + self.logger = logging.getLogger("MacawLogger") self.params["curr_attrs"] = self.curr_attrs = CurrentAttributes() self.params["actions"] = self.generate_actions() self.interface = interface.get_interface(params) self.request_dispatcher = RequestDispatcher(self.params) self.output_selection = naive_output_selection.NaiveOutputProcessing({}) self.dialogue_manager = DialogManager() - self.nlp_pipeline = NlpPipeline(params.get("nlp_modules", {})) + self.nlp_pipeline = NlpPipeline(params.get("nlp_models", {})) + self.response_generator_handler = ResponseGeneratorHandler(params.get("response_generator_models", {})) try: self.nlp_util = util.NLPUtil(self.params) self.params["nlp_util"] = self.nlp_util except Exception as ex: - self.params["logger"].warning( - "WARNING: There is a problem with setting up the NLP utility module. " - + str(ex) + self.logger.warning( + f"There is a problem with setting up the NLP utility module. {type(ex)}, {ex}" ) self.timeout = self.params["timeout"] if "timeout" in self.params else -1 diff --git a/macaw/core/dialogue_manager/dialogue_manager.py b/macaw/core/dialogue_manager/dialogue_manager.py index 9c5d250..d64f48f 100644 --- a/macaw/core/dialogue_manager/dialogue_manager.py +++ b/macaw/core/dialogue_manager/dialogue_manager.py @@ -9,7 +9,7 @@ def __init__(self, dst=None): def decode(cls, encoded: dict): return DialogManager(DST.decode(encoded["dst"])) - def process_turn(self, nlp_pipeline): + def process_turn(self, nlp_pipeline_output: dict): # Get these from the nlp_pipeline output. input_alphabet = None context = None @@ -17,10 +17,11 @@ def process_turn(self, nlp_pipeline): new_state = self.dst.transition(input_alphabet, context) if isinstance(new_state, State): self.dst.update(new_state) - else: - # Save output result in conversation context. print(f"new_state={new_state}") + else: + print(f"Couldn't change state: {new_state}") pass + # Save output result in conversation context. def encode(self) -> dict: return { diff --git a/macaw/core/mrc/__init__.py b/macaw/core/mrc/__init__.py index a56086b..587747a 100644 --- a/macaw/core/mrc/__init__.py +++ b/macaw/core/mrc/__init__.py @@ -3,6 +3,7 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ +import logging from macaw.core.mrc import drqa_mrc @@ -17,7 +18,8 @@ def get_mrc_model(params): Returns: An MRC object for machine reading comprehension. """ - params['logger'].info('The MRC model for QA: ' + params['mrc']) + logger = logging.getLogger("MacawLogger") + logger.info('The MRC model for QA: ' + params['mrc']) if params['mrc'] == 'drqa': return drqa_mrc.DrQA(params) else: diff --git a/macaw/core/nlp_pipeline/nlp_pipeline.py b/macaw/core/nlp_pipeline/nlp_pipeline.py index 633d9b7..358ade8 100644 --- a/macaw/core/nlp_pipeline/nlp_pipeline.py +++ b/macaw/core/nlp_pipeline/nlp_pipeline.py @@ -1,19 +1,68 @@ +import json +import logging +import multiprocessing +import time +from multiprocessing import Pool from typing import List +import requests + from core.interaction_handler import Message +class RemoteModel: + def __init__(self, model_name: str, endpoint: str): + self.model_name = model_name + self.endpoint = endpoint + + def run(self, request: dict) -> dict: + try: + response = requests.post(url=self.endpoint, data=json.dumps(request)) + return response.json() + except Exception as e: + return { + "response": f"Error in post request call for {self.model_name}: {e}", + "error": True + } + + class NlpPipeline: def __init__(self, modules: dict): self.modules = dict() - self.modules.update(modules) - self.context = dict() + self.logger = logging.getLogger("MacawLogger") + for model_name, endpoint in modules.items(): + self.modules[model_name] = RemoteModel(model_name, endpoint) def run(self, conv_list: List[Message]): - for model_name, model in self.modules.items(): - # TODO: pass in required input to every model. - conv = None # get from state_manager - model_output = model.run(conv) + """ + Runs all the models and saves their results in the latest conversation (conv_list[0]) message. + """ + nlp_pipeline_result = {} + + with Pool(processes=4) as pool: + all_async_results = [] + for model_name, model in self.modules.items(): + # pass in required input to every model. + model_input = {"text": "input text for model"} + async_result = pool.apply_async( + func=model.run, + args=(model_input,) + ) + all_async_results.append((model_name, async_result)) + pool.close() + + max_wait_time_in_secs = 10 # wait time for all modules combined. + end_time = time.time() + max_wait_time_in_secs + for model_name, async_result in all_async_results: + try: + nlp_pipeline_result[model_name] = async_result.get(timeout=max(end_time - time.time(), 0)) + except multiprocessing.TimeoutError as te: + resp_msg = f"Module {model_name} timed out." + self.logger.error(f"{resp_msg} {te}") + nlp_pipeline_result[model_name] = { + "response": resp_msg, + "error": True, + } + pool.join() - # TODO: Save output in state_manager. - pass + conv_list[0].nlp_pipeline_result = nlp_pipeline_result diff --git a/macaw/core/input_handler/__init__.py b/macaw/core/response/__init__.py similarity index 100% rename from macaw/core/input_handler/__init__.py rename to macaw/core/response/__init__.py diff --git a/macaw/core/input_handler/action_detection.py b/macaw/core/response/action_detection.py similarity index 95% rename from macaw/core/input_handler/action_detection.py rename to macaw/core/response/action_detection.py index b7e8743..dc4810e 100644 --- a/macaw/core/input_handler/action_detection.py +++ b/macaw/core/response/action_detection.py @@ -6,7 +6,7 @@ import multiprocessing -from macaw.core.input_handler import actions +from macaw.core.response import actions class PreActionRequestDispatcher: @@ -99,7 +99,10 @@ def dispatch(self, conv_list): manager = multiprocessing.Manager() action_results = manager.dict() for action in self.params['actions']: - p = multiprocessing.Process(target=actions.run_action, args=[action, conv_list.copy(), self.params, action_results]) + p = multiprocessing.Process( + target=actions.run_action, + args=[action, conv_list.copy(), self.params, action_results] + ) action_processes.append(p) p.start() diff --git a/macaw/core/input_handler/actions.py b/macaw/core/response/actions.py similarity index 100% rename from macaw/core/input_handler/actions.py rename to macaw/core/response/actions.py diff --git a/macaw/core/response/docker.py b/macaw/core/response/docker.py new file mode 100644 index 0000000..fd75cdc --- /dev/null +++ b/macaw/core/response/docker.py @@ -0,0 +1,30 @@ +import json + +import requests + +from core.response.response_generator import ResponseGenerator + + +class ResponseGeneratorDocker(ResponseGenerator): + """ + A class that encapsulates an ML model running in a local docker container. + """ + + def __init__(self, name: str, endpoint: str): + super().__init__(name) + self.endpoint = endpoint + + def run(self, conv_list) -> dict: + # extract request from the conversation. + request = { + "text": "input message for RG docker model." + } + + try: + response = requests.post(url=self.endpoint, data=json.dumps(request)) + return response.json() + except Exception as e: + return { + "response": f"Error in post request call for {self.name}: {e}", + "error": True + } diff --git a/macaw/core/response/handler.py b/macaw/core/response/handler.py new file mode 100644 index 0000000..095ed96 --- /dev/null +++ b/macaw/core/response/handler.py @@ -0,0 +1,37 @@ +import logging +from typing import List + +from core.response.docker import ResponseGeneratorDocker +from core.response.response_generator import ResponseGenerator + + +class ResponseGeneratorHandler: + def __init__(self, rg_models: dict): + self.rg_models = dict() + self.logger = logging.getLogger("MacawLogger") + + for model_name, model in rg_models.items(): + if isinstance(model, ResponseGenerator): + self.rg_models[model_name] = model + elif isinstance(model, str) and model.startswith("http://"): + self.rg_models[model_name] = ResponseGeneratorDocker(model_name, model) + else: + self.logger.warning(f"Response generator {model_name}:{model} is not supported.") + + def models_selector(self, conv_list) -> List[str]: + selected_models = [] + for model_name in self.rg_models: + if model_name == "qa": + # decide if qa RG should be run or not here. + selected_models.append(model_name) + elif model_name == "punctuation": + selected_models.append(model_name) + else: + self.logger.warning(f"Model selector not written for {model_name}. Ignoring the model.") + return selected_models + + def run_models(self, model_names: List[str], conv_list) -> dict: + models_response = dict() + for model_name in model_names: + models_response[model_name] = self.rg_models[model_name].run(conv_list) + return models_response diff --git a/macaw/core/response/punctuation.py b/macaw/core/response/punctuation.py new file mode 100644 index 0000000..ef12b86 --- /dev/null +++ b/macaw/core/response/punctuation.py @@ -0,0 +1,34 @@ +import logging +import time + +from macaw.util.custom_logging import LoggerFactory +from deepmultilingualpunctuation import PunctuationModel +from core.response.response_generator import ResponseGenerator + + +class ResponseGeneratorPunctuation(ResponseGenerator): + def __init__(self, name): + super().__init__(name) + self.logger = LoggerFactory.create_logger(params={ + "logger_name": "macaw.core.response.response_generator.punctuation", + "logger_level": logging.INFO + }) + + self.logger.info("Going to download punctuation ML model. This takes time.") + # self.model = PunctuationModel() + self.logger.info("Punctuation model loaded successfully.") + + def run(self, conv_list) -> dict: + # generate input text from the input conversation. + input_text = "My name is Clara and I live in Berkeley California Ist das eine Frage Frau Müller" + + t0 = time.time() + # result = self.model.restore_punctuation(input_text) + result = "punctuation model output." + duration = time.time() - t0 + + return { + "response": f"Local RG: {self.name}. {result}", + "error": False, + "performance": duration + } diff --git a/macaw/core/response/response_generator.py b/macaw/core/response/response_generator.py new file mode 100644 index 0000000..a97aef7 --- /dev/null +++ b/macaw/core/response/response_generator.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + + +class ResponseGenerator(ABC): + """ + An abstract class for response generator (RG). It can be implemented as a local RG using a local class or as a + remote RG using a docker container to generate the response. + """ + def __init__(self, name: str): + self.name = name + + @abstractmethod + def run(self, conv_list) -> dict: + pass diff --git a/macaw/core/retrieval/__init__.py b/macaw/core/retrieval/__init__.py index 6ed9fdd..2c0dabe 100644 --- a/macaw/core/retrieval/__init__.py +++ b/macaw/core/retrieval/__init__.py @@ -3,6 +3,8 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ +import logging + import macaw.core.retrieval.bing_api import macaw.core.retrieval.tantivy from macaw.core.retrieval import query_generation @@ -12,7 +14,7 @@ def get_retrieval_model(params): """ This method returns the Retrieval class requested in the parameter dict. Args: - params(dict): A dict of parameters. In this method, the parameters 'logger' and 'query_generation', and + params(dict): A dict of parameters. In this method, the parameters 'query_generation', and 'search_engine' are required. Based on the requested retrievel model, some more parameters may be mandatory. Currently, Macaw serves two different search engines. One is based on indri (http://lemurproject.org/indri.php), and the other one is the Microsoft Bing API. If you want to retrieve results from your own document collection, @@ -21,7 +23,8 @@ def get_retrieval_model(params): Returns: A Retrieval object for document retrieval. """ - params["logger"].info( + logger = logging.getLogger("MacawLogger") + logger.info( "The query generation model for retrieval: " + params["query_generation"] ) if params["query_generation"] == "simple": @@ -29,7 +32,7 @@ def get_retrieval_model(params): else: raise Exception("The requested query generation model does not exist!") - params["logger"].info("The search engine for retrieval: " + params["search_engine"]) + logger.info("The search engine for retrieval: " + params["search_engine"]) if params["search_engine"] == "tantivy": return macaw.core.retrieval.tantivy.Tantivy( { @@ -37,7 +40,7 @@ def get_retrieval_model(params): "path": params["search_engine_path"], "load": True, "results_requested": params["results_requested"], - "logger": params["logger"], + "logger": logger, } ) elif params["search_engine"] == "bing": @@ -46,7 +49,7 @@ def get_retrieval_model(params): "query_generation": q_generation, "bing_key": params["bing_key"], "results_requested": params["results_requested"], - "logger": params["logger"], + "logger": logger, } ) else: diff --git a/macaw/docker/flask_app/Dockerfile b/macaw/docker/flask_app/Dockerfile new file mode 100644 index 0000000..dce887d --- /dev/null +++ b/macaw/docker/flask_app/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.8.13 + +RUN apt-get update && apt-get install -y \ + nginx \ + supervisor \ + gcc \ + g++ \ + curl \ + python3-pip \ + vim + +# update pip +RUN pip3 install pip --upgrade + +# Setup flask application +RUN mkdir -p /deploy/app +# copy and install requirements before so that other changes do not require +# downloading requirements again. +COPY app/requirements.txt /deploy/app/requirements.txt +RUN pip3 install -r /deploy/app/requirements.txt +COPY app /deploy/app + +# Setup nginx +RUN rm /etc/nginx/sites-enabled/default +COPY config/flask.conf /etc/nginx/sites-available/ +RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf +RUN echo "daemon off;" >> /etc/nginx/nginx.conf + +RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn +# Setup supervisord +RUN mkdir -p /var/log/supervisor +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf + +EXPOSE 80 + +# Start processes +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/macaw/docker/flask_app/app/app.py b/macaw/docker/flask_app/app/app.py new file mode 100644 index 0000000..9adbaef --- /dev/null +++ b/macaw/docker/flask_app/app/app.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import time +import logging +import sys +from typing import Optional + +from flask import Flask, request, Response +from flask_restful import reqparse, Api, Resource + +import remote_module + +app = Flask("remote module") +api = Api(app) + +app.logger.addHandler(logging.StreamHandler(sys.stdout)) +app.logger.setLevel(logging.DEBUG) + + +class RemoteModule(Resource): + + def get(self): + return 200 + + def post(self): + t0 = time.time() + + args = request.get_json(force=True) + print(f"post request arguments {args}") + validation = self.__validate_input(args) + if validation: + return validation, 500 + + ret = {} + ret.update(self.__get_response(args)) + ret["performance"] = time.time() - t0 + ret["error"] = False + return ret, 200 + + @staticmethod + def __validate_input(args: dict) -> Optional[dict]: + message = "" + for ctx in remote_module.get_required_context(): + if not args.get(ctx): + message = "Context missing: " + ctx + if message: + return { + "message": message, + "error": True + } + return None + + @staticmethod + def __get_response(msg) -> dict: + response = remote_module.handle_message(msg) + app.logger.info("remote model result: %s", response) + return response + + +api.add_resource(RemoteModule, '/') + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8001) diff --git a/macaw/docker/flask_app/app/remote_module.py b/macaw/docker/flask_app/app/remote_module.py new file mode 100644 index 0000000..589f223 --- /dev/null +++ b/macaw/docker/flask_app/app/remote_module.py @@ -0,0 +1,15 @@ + +required_context = ["text"] + + +def get_required_context(): + return required_context + + +def handle_message(msg: dict) -> dict: + """ + Returns the response dictionary. Throws appropriate error if the model cannot generate a response. + """ + return { + "response": "okay from flask app." + } diff --git a/macaw/docker/flask_app/app/requirements.txt b/macaw/docker/flask_app/app/requirements.txt new file mode 100644 index 0000000..dea81b5 --- /dev/null +++ b/macaw/docker/flask_app/app/requirements.txt @@ -0,0 +1,11 @@ +Flask==2.1.2 +Flask-RESTful==0.3.9 +Werkzeug +future +jsonpickle +numpy +pandoc +six +typing +ConfigArgParse +gunicorn diff --git a/macaw/docker/flask_app/config/flask.conf b/macaw/docker/flask_app/config/flask.conf new file mode 100644 index 0000000..3b68b1c --- /dev/null +++ b/macaw/docker/flask_app/config/flask.conf @@ -0,0 +1,9 @@ +server { + listen 80; + + location / { + proxy_pass http://localhost:5001/; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} diff --git a/macaw/docker/flask_app/config/gunicorn.conf b/macaw/docker/flask_app/config/gunicorn.conf new file mode 100644 index 0000000..bfecb41 --- /dev/null +++ b/macaw/docker/flask_app/config/gunicorn.conf @@ -0,0 +1,3 @@ +[program:gunicorn] +command=/usr/bin/gunicorn app:app -b localhost:5001 +directory=/deploy/app diff --git a/macaw/docker/flask_app/config/supervisord.conf b/macaw/docker/flask_app/config/supervisord.conf new file mode 100644 index 0000000..a32f072 --- /dev/null +++ b/macaw/docker/flask_app/config/supervisord.conf @@ -0,0 +1,18 @@ +[supervisord] +nodaemon=true +user=root + +[program:nginx] +command=/usr/sbin/nginx +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:gunicorn] +command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info +directory=/deploy/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 diff --git a/macaw/docker/ic_app/Dockerfile b/macaw/docker/ic_app/Dockerfile new file mode 100644 index 0000000..dce887d --- /dev/null +++ b/macaw/docker/ic_app/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.8.13 + +RUN apt-get update && apt-get install -y \ + nginx \ + supervisor \ + gcc \ + g++ \ + curl \ + python3-pip \ + vim + +# update pip +RUN pip3 install pip --upgrade + +# Setup flask application +RUN mkdir -p /deploy/app +# copy and install requirements before so that other changes do not require +# downloading requirements again. +COPY app/requirements.txt /deploy/app/requirements.txt +RUN pip3 install -r /deploy/app/requirements.txt +COPY app /deploy/app + +# Setup nginx +RUN rm /etc/nginx/sites-enabled/default +COPY config/flask.conf /etc/nginx/sites-available/ +RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf +RUN echo "daemon off;" >> /etc/nginx/nginx.conf + +RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn +# Setup supervisord +RUN mkdir -p /var/log/supervisor +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf + +EXPOSE 80 + +# Start processes +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/macaw/docker/ic_app/app/app.py b/macaw/docker/ic_app/app/app.py new file mode 100644 index 0000000..9adbaef --- /dev/null +++ b/macaw/docker/ic_app/app/app.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import time +import logging +import sys +from typing import Optional + +from flask import Flask, request, Response +from flask_restful import reqparse, Api, Resource + +import remote_module + +app = Flask("remote module") +api = Api(app) + +app.logger.addHandler(logging.StreamHandler(sys.stdout)) +app.logger.setLevel(logging.DEBUG) + + +class RemoteModule(Resource): + + def get(self): + return 200 + + def post(self): + t0 = time.time() + + args = request.get_json(force=True) + print(f"post request arguments {args}") + validation = self.__validate_input(args) + if validation: + return validation, 500 + + ret = {} + ret.update(self.__get_response(args)) + ret["performance"] = time.time() - t0 + ret["error"] = False + return ret, 200 + + @staticmethod + def __validate_input(args: dict) -> Optional[dict]: + message = "" + for ctx in remote_module.get_required_context(): + if not args.get(ctx): + message = "Context missing: " + ctx + if message: + return { + "message": message, + "error": True + } + return None + + @staticmethod + def __get_response(msg) -> dict: + response = remote_module.handle_message(msg) + app.logger.info("remote model result: %s", response) + return response + + +api.add_resource(RemoteModule, '/') + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8001) diff --git a/macaw/docker/ic_app/app/remote_module.py b/macaw/docker/ic_app/app/remote_module.py new file mode 100644 index 0000000..bff776c --- /dev/null +++ b/macaw/docker/ic_app/app/remote_module.py @@ -0,0 +1,15 @@ + +required_context = ["text"] + + +def get_required_context(): + return required_context + + +def handle_message(msg: dict) -> dict: + """ + Returns the response dictionary. Throws appropriate error if the model cannot generate a response. + """ + return { + "response": "okay from ic model." + } diff --git a/macaw/docker/ic_app/app/requirements.txt b/macaw/docker/ic_app/app/requirements.txt new file mode 100644 index 0000000..dea81b5 --- /dev/null +++ b/macaw/docker/ic_app/app/requirements.txt @@ -0,0 +1,11 @@ +Flask==2.1.2 +Flask-RESTful==0.3.9 +Werkzeug +future +jsonpickle +numpy +pandoc +six +typing +ConfigArgParse +gunicorn diff --git a/macaw/docker/ic_app/config/flask.conf b/macaw/docker/ic_app/config/flask.conf new file mode 100644 index 0000000..3b68b1c --- /dev/null +++ b/macaw/docker/ic_app/config/flask.conf @@ -0,0 +1,9 @@ +server { + listen 80; + + location / { + proxy_pass http://localhost:5001/; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} diff --git a/macaw/docker/ic_app/config/gunicorn.conf b/macaw/docker/ic_app/config/gunicorn.conf new file mode 100644 index 0000000..bfecb41 --- /dev/null +++ b/macaw/docker/ic_app/config/gunicorn.conf @@ -0,0 +1,3 @@ +[program:gunicorn] +command=/usr/bin/gunicorn app:app -b localhost:5001 +directory=/deploy/app diff --git a/macaw/docker/ic_app/config/supervisord.conf b/macaw/docker/ic_app/config/supervisord.conf new file mode 100644 index 0000000..a32f072 --- /dev/null +++ b/macaw/docker/ic_app/config/supervisord.conf @@ -0,0 +1,18 @@ +[supervisord] +nodaemon=true +user=root + +[program:nginx] +command=/usr/sbin/nginx +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:gunicorn] +command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info +directory=/deploy/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 diff --git a/macaw/docker/qa_app/Dockerfile b/macaw/docker/qa_app/Dockerfile new file mode 100644 index 0000000..dce887d --- /dev/null +++ b/macaw/docker/qa_app/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.8.13 + +RUN apt-get update && apt-get install -y \ + nginx \ + supervisor \ + gcc \ + g++ \ + curl \ + python3-pip \ + vim + +# update pip +RUN pip3 install pip --upgrade + +# Setup flask application +RUN mkdir -p /deploy/app +# copy and install requirements before so that other changes do not require +# downloading requirements again. +COPY app/requirements.txt /deploy/app/requirements.txt +RUN pip3 install -r /deploy/app/requirements.txt +COPY app /deploy/app + +# Setup nginx +RUN rm /etc/nginx/sites-enabled/default +COPY config/flask.conf /etc/nginx/sites-available/ +RUN ln -s /etc/nginx/sites-available/flask.conf /etc/nginx/sites-enabled/flask.conf +RUN echo "daemon off;" >> /etc/nginx/nginx.conf + +RUN ln -s /usr/local/bin/gunicorn /usr/bin/gunicorn +# Setup supervisord +RUN mkdir -p /var/log/supervisor +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +#COPY config/gunicorn.conf /etc/supervisor/conf.d/gunicorn.conf + +EXPOSE 80 + +# Start processes +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/macaw/docker/qa_app/app/app.py b/macaw/docker/qa_app/app/app.py new file mode 100644 index 0000000..eb48d75 --- /dev/null +++ b/macaw/docker/qa_app/app/app.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import time +import logging +import sys +from typing import Optional + +from flask import Flask, request, Response +from flask_restful import reqparse, Api, Resource + +import remote_module + +app = Flask("remote module") +api = Api(app) + +app.logger.addHandler(logging.StreamHandler(sys.stdout)) +app.logger.setLevel(logging.DEBUG) + + +class RemoteModule(Resource): + + def get(self): + return 200 + + def post(self): + t0 = time.time() + + args = request.get_json(force=True) + print(f"post request arguments {args}") + validation = self.__validate_input(args) + if validation: + return validation, 500 + + ret = {} + ret.update(self.__get_response(args)) + ret["performance"] = time.time() - t0 + ret["error"] = False + return ret, 200 + + @staticmethod + def __validate_input(args: dict) -> Optional[dict]: + message = "" + for ctx in remote_module.get_required_context(): + if not args.get(ctx): + message = "Context missing: " + ctx + if message: + return { + "message": message, + "error": True + } + return None + + @staticmethod + def __get_response(msg) -> dict: + response = remote_module.handle_message(msg) + app.logger.info("remote model result: %s", response) + return response + + +api.add_resource(RemoteModule, '/') + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=os.environ.get('REMOTE_MODULE_PORT') or 8003) diff --git a/macaw/docker/qa_app/app/remote_module.py b/macaw/docker/qa_app/app/remote_module.py new file mode 100644 index 0000000..af29ba8 --- /dev/null +++ b/macaw/docker/qa_app/app/remote_module.py @@ -0,0 +1,15 @@ + +required_context = ["text"] + + +def get_required_context(): + return required_context + + +def handle_message(msg: dict) -> dict: + """ + Returns the response dictionary. Throws appropriate error if the model cannot generate a response. + """ + return { + "response": "okay response from qa model." + } diff --git a/macaw/docker/qa_app/app/requirements.txt b/macaw/docker/qa_app/app/requirements.txt new file mode 100644 index 0000000..6ad266d --- /dev/null +++ b/macaw/docker/qa_app/app/requirements.txt @@ -0,0 +1,15 @@ +Flask==2.1.2 +Flask-RESTful==0.3.9 +Werkzeug +future +jsonpickle +numpy +pandoc +six +typing +ConfigArgParse +gunicorn +transformers==4.11.2 +itsdangerous==2.0.1 +torch +boto3 diff --git a/macaw/docker/qa_app/config/flask.conf b/macaw/docker/qa_app/config/flask.conf new file mode 100644 index 0000000..3b68b1c --- /dev/null +++ b/macaw/docker/qa_app/config/flask.conf @@ -0,0 +1,9 @@ +server { + listen 80; + + location / { + proxy_pass http://localhost:5001/; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } +} diff --git a/macaw/docker/qa_app/config/gunicorn.conf b/macaw/docker/qa_app/config/gunicorn.conf new file mode 100644 index 0000000..bfecb41 --- /dev/null +++ b/macaw/docker/qa_app/config/gunicorn.conf @@ -0,0 +1,3 @@ +[program:gunicorn] +command=/usr/bin/gunicorn app:app -b localhost:5001 +directory=/deploy/app diff --git a/macaw/docker/qa_app/config/supervisord.conf b/macaw/docker/qa_app/config/supervisord.conf new file mode 100644 index 0000000..a32f072 --- /dev/null +++ b/macaw/docker/qa_app/config/supervisord.conf @@ -0,0 +1,18 @@ +[supervisord] +nodaemon=true +user=root + +[program:nginx] +command=/usr/sbin/nginx +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:gunicorn] +command=/usr/bin/gunicorn app:app -w 1 -b localhost:5001 --log-level info +directory=/deploy/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 diff --git a/macaw/interface/fileio.py b/macaw/interface/fileio.py index 5ef4c53..33bd4e4 100644 --- a/macaw/interface/fileio.py +++ b/macaw/interface/fileio.py @@ -68,7 +68,8 @@ def result_presentation(self, output_msg, params): "Therefore, the message type should be options." ) elif self.params["output_format"] == "text": - if output_msg.msg_info["msg_type"] == "text": + msg_type = output_msg.msg_info["msg_type"] + if msg_type == "text": output_file.write( qid + "\t" @@ -77,7 +78,7 @@ def result_presentation(self, output_msg, params): ) else: raise Exception( - "text output format is only recognized for text outputs." + f"Text output format is only recognized for text outputs. Found {msg_type}." ) else: raise Exception("Unknown output file format!") diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py index cc91eb6..a59ffa6 100644 --- a/macaw/interface/stdio.py +++ b/macaw/interface/stdio.py @@ -18,6 +18,7 @@ def __init__(self, params): self.msg_id = int(time.time()) def run(self): + print(f"Inside StdioInterface run method.") while True: try: request = input("ENTER YOUR COMMAND (type 'exit' to leave): ").strip() diff --git a/macaw/live_main.py b/macaw/live_main.py index 628f6ad..b50af55 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -7,9 +7,10 @@ from typing import List from core.interaction_handler import Message +from core.response.punctuation import ResponseGeneratorPunctuation from macaw.cis import CIS from macaw.core import mrc, retrieval -from macaw.util.logging import Logger +from macaw.util.custom_logging import LoggerFactory class ConvQA(CIS): @@ -25,7 +26,6 @@ def __init__(self, params): for more information on the required parameters. """ super().__init__(params) - self.logger = params["logger"] self.logger.info("Conversational QA Model... starting up...") def generate_actions(self) -> dict: @@ -36,35 +36,43 @@ def generate_actions(self) -> dict: def request_handler_func(self, conv_list: List[Message]) -> Message: """ - This function is called for each conversational interaction made by the user. In fact, this function calls the - dispatcher to send the user request to the information seeking components. + This function is called for each conversational interaction made by the user. It calls the NLP pipeline, DST + model and all the actions and saves their result in the latest conversation message. Args: - conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the + conv_list(list): List of Message, each corresponding to a conversational message from / to the user. This list is in reverse order, meaning that the first elements is the last interaction made by user. Returns: output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user. + It is the latest conversation message. """ self.logger.info(conv_list) self.nlp_pipeline.run(conv_list) + self.logger.info(f"nlp pipeline result: {conv_list[0].nlp_pipeline_result}") - # Run the DST module here. Save the output locally. - nlp_pipeline_output = None + # Run the DST module and save the output in conversation. + nlp_pipeline_output = conv_list[0].nlp_pipeline_result self.dialogue_manager.process_turn(nlp_pipeline_output) + conv_list[0].dialog_manager = self.dialogue_manager - # TODO: request dispatcher should also use DST output. dispatcher_output = self.request_dispatcher.dispatch(conv_list) output_msg = self.output_selection.get_output(conv_list, dispatcher_output) - # Save nlp_pipeline result, action result and dialogue_manager so that they are persisted in DB. - output_msg.nlp_pipeline_result = nlp_pipeline_output - output_msg.actions_result = None - output_msg.dialog_manager = self.dialogue_manager + # Run response generator. + models_to_run = self.response_generator_handler.models_selector(conv_list) + rg_output = self.response_generator_handler.run_models(models_to_run, conv_list) + self.logger.info(f"rg_output: {rg_output}") - return output_msg + # TODO: Select or create response from RG output. + + conv_list[0].msg_info = output_msg.msg_info + conv_list[0].response = output_msg.response + conv_list[0].timestamp = output_msg.timestamp + + return conv_list[0] def run(self): """ @@ -84,8 +92,10 @@ def run(self): basic_params = { "timeout": 15, # timeout is in terms of second. "mode": args.mode, # mode can be either live or exp. - "logger": Logger({}), - } # for logging into file, pass the filepath to the Logger class. + } + + # for logging into file, pass the filepath to the Logger class. + my_logger = LoggerFactory.create_logger({}) # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the # database, as well as the database name. @@ -134,12 +144,26 @@ def run(self): "qa_results_requested": 3, } # The number of candidate answers returned by the MRC model. + # ML modules which are part of the NLP pipeline and all response generators. + ml_modules = { + "nlp_models": { + "intent_classification": "http://nlp-pipeline-app-ic:80", + "sample_flask": "http://nlp-pipeline-app-flask:80", + }, + "response_generator_models": { + "qa": "http://response-generator-app-qa:80", + "punctuation": ResponseGeneratorPunctuation(name="punctuation_model") + } + } + params = { **basic_params, **db_params, **interface_params, **retrieval_params, **mrc_params, + **ml_modules } - basic_params["logger"].info(params) + + my_logger.info(params) ConvQA(params).run() diff --git a/macaw/util/__init__.py b/macaw/util/__init__.py index d3c494b..07aaaf4 100644 --- a/macaw/util/__init__.py +++ b/macaw/util/__init__.py @@ -6,10 +6,14 @@ import json +import backoff as backoff from stanfordcorenlp import StanfordCoreNLP class NLPUtil: + @backoff.on_exception(backoff.expo, + Exception, + max_tries=3) def __init__(self, params): """ A simple NLP helper class. diff --git a/macaw/util/custom_logging.py b/macaw/util/custom_logging.py new file mode 100644 index 0000000..7ac0da7 --- /dev/null +++ b/macaw/util/custom_logging.py @@ -0,0 +1,28 @@ +""" +The internal logger. + +Authors: Hamed Zamani (hazamani@microsoft.com) +""" + +import logging + + +class LoggerFactory: + @staticmethod + def create_logger(params) -> logging.Logger: + """ + Creates a new custom logger with configuration passed in params dictionary. Supported options are + 'logger_name', 'logger_level', and 'logger_file'. + """ + logger = logging.getLogger(params.get("logger_name", "MacawLogger")) + logger.setLevel(params.get("logger_level", logging.DEBUG)) + + if "logger_file" in params: + handler = logging.FileHandler(params["logger_file"]) + else: + handler = logging.StreamHandler() + + formatter = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger diff --git a/macaw/util/logging.py b/macaw/util/logging.py deleted file mode 100644 index 156ae77..0000000 --- a/macaw/util/logging.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -The internal logger. - -Authors: Hamed Zamani (hazamani@microsoft.com) -""" - -import logging - - -class Logger(logging.Logger): - def __init__(self, params): - """ - A simple logging class, inherited from the standard logging.Logger. - - Args: - params(dict): A dict containing some parameters. 'logging_file' is an optional parameter, otherwise STDIO - will be used for logging. - """ - super().__init__('Macaw Logger') - self.params = params - if 'logging_file' in params: - self.handler_ = logging.FileHandler(params['logging_file']) - else: - self.handler_ = logging.StreamHandler() - - self.format = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s') - self.handler_.setFormatter(self.format) - self.addHandler(self.handler_) diff --git a/macaw/wizard_of_oz_main.py b/macaw/wizard_of_oz_main.py index 08331d5..7324bfb 100644 --- a/macaw/wizard_of_oz_main.py +++ b/macaw/wizard_of_oz_main.py @@ -3,15 +3,15 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ - +import logging import multiprocessing from macaw import interface from macaw.core import retrieval -from macaw.core.input_handler.action_detection import RequestDispatcher +from macaw.core.response.action_detection import RequestDispatcher from macaw.core.interaction_handler.user_requests_db import InteractionDB from macaw.core.output_handler import naive_output_selection -from macaw.util.logging import Logger +from macaw.util.custom_logging import LoggerFactory class Seeker: @@ -27,7 +27,7 @@ def __init__(self, params): for more information on the required parameters. """ self.params = params - self.logger = params['logger'] + self.logger = logging.getLogger("MacawLogger") self.logger.info('Conversational Wirzard of Oz System... starting up...') self.wizard = None self.params['live_request_handler'] = self.live_request_handler @@ -87,7 +87,7 @@ def __init__(self, params): for more information on the required parameters. """ self.params = params - self.logger = params['logger'] + self.logger = logging.getLogger("MacawLogger") self.logger.info('Conversational Wirzard of Oz System... starting up...') self.params['live_request_handler'] = self.live_request_handler self.seeker = None @@ -148,9 +148,9 @@ def run(self): if __name__ == '__main__': + my_logger = LoggerFactory.create_logger({}) basic_params = {'timeout': 15, # timeout is in terms of second. - 'mode': 'live', # mode can be either live or exp. - 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class. + 'mode': 'live'} # mode can be either live or exp. # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the # database, as well as the database name. @@ -190,8 +190,8 @@ def run(self): seeker_params = {**basic_params, **db_params, **seeker_interface_params, **retrieval_params} wizard_params = {**basic_params, **db_params, **wizard_interface_params, **retrieval_params} - basic_params['logger'].info(seeker_params) - basic_params['logger'].info(wizard_params) + my_logger.info(seeker_params) + my_logger.info(wizard_params) seeker = Seeker(seeker_params) wizard = Wizard(wizard_params) @@ -204,6 +204,6 @@ def run(self): seeker_process.start() wizard_process.start() - basic_params['logger'].info('Seeker Process ID: {}'.format(seeker_process.pid)) - basic_params['logger'].info('Wizard Process ID: {}'.format(wizard_process.pid)) + my_logger.info('Seeker Process ID: {}'.format(seeker_process.pid)) + my_logger.info('Wizard Process ID: {}'.format(wizard_process.pid)) diff --git a/requirements.txt b/requirements.txt index bf68383..856df75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,5 @@ stanfordcorenlp google-cloud-texttospeech tantivy pymongo +backoff +deepmultilingualpunctuation diff --git a/scripts/start.sh b/scripts/start.sh index 9f2b727..a9f61dd 100644 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -9,4 +9,6 @@ echo "MongoDB started successfully." echo "This is the python version: $(python3 --version)" # Start the main application. -python3 macaw/live_main.py --mode live --interface stdio +python3 macaw/live_main.py --mode exp --interface fileio +# Use the below command for terminal IO. +#python3 macaw/live_main.py --mode live --interface stdio diff --git a/setup.py b/setup.py index 8675bdc..c9a82ac 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='macaw', version='0.1', - packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.input_handler', + packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.response', 'macaw.core.output_handler', 'macaw.core.interaction_handler', 'macaw.util', 'macaw.interface'], url='https://github.com/microsoft/macaw/', license='MIT', From b2b57c9e0ea8bf888ad59f1d3c9990f71d44fe9a Mon Sep 17 00:00:00 2001 From: Amit Hattimare Date: Sat, 9 Jul 2022 21:57:05 -0700 Subject: [PATCH 10/10] Saving complete output of fileio and stdio in a verbose output file. (#9) This type of output is helpful for analysis purposes. Co-authored-by: Amit Hattimare --- docker-compose.yml | 8 ++++ macaw/core/interaction_handler/msg.py | 2 + macaw/interface/fileio.py | 45 +++++++++++++------- macaw/interface/interface.py | 2 +- macaw/interface/stdio.py | 60 ++++++++++++++------------- macaw/interface/telegram.py | 4 +- macaw/live_main.py | 6 ++- 7 files changed, 81 insertions(+), 46 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index b4a9c6b..db68cec 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,10 @@ services: build: ./macaw/docker/flask_app ports: - "127.0.0.1:8001:80" + deploy: + resources: + limits: + memory: 100mb nlp-pipeline-app-ic: build: ./macaw/docker/ic_app ports: @@ -24,3 +28,7 @@ services: build: ./macaw/docker/qa_app ports: - "127.0.0.1:8003:80" + deploy: + resources: + limits: + memory: 100mb diff --git a/macaw/core/interaction_handler/msg.py b/macaw/core/interaction_handler/msg.py index cb851c9..a186887 100644 --- a/macaw/core/interaction_handler/msg.py +++ b/macaw/core/interaction_handler/msg.py @@ -83,4 +83,6 @@ def __iter__(self): value_ret = value.__dict__ elif isinstance(value, DialogManager): value_ret = value.encode() + elif isinstance(value, datetime): + value_ret = str(value) yield attr, value_ret diff --git a/macaw/interface/fileio.py b/macaw/interface/fileio.py index 33bd4e4..16e90dc 100644 --- a/macaw/interface/fileio.py +++ b/macaw/interface/fileio.py @@ -3,7 +3,7 @@ Authors: Hamed Zamani (hazamani@microsoft.com) """ - +import json import time from datetime import datetime @@ -17,8 +17,9 @@ def __init__(self, params): self.msg_id = int(time.time()) def run(self): - output_file = open(self.params["output_file_path"], "w+") - with open(self.params["input_file_path"]) as input_file: + with open(self.params["output_file_path"], "w+") as output_file_handler, \ + open(self.params["verbose_output_file_path"], "w+") as verbose_file_handler, \ + open(self.params["input_file_path"]) as input_file: for line in input_file: str_list = line.strip().split("\t") if len(str_list) != 2: @@ -40,19 +41,35 @@ def run(self): ) output_msg = self.params["experimental_request_handler"](msg) self.result_presentation( - output_msg, {"output_file": output_file, "qid": qid} + output_msg, + { + "qid": qid, + "output_file_handler": output_file_handler, + "verbose_file_handler": verbose_file_handler + } ) - output_file.close() - def result_presentation(self, output_msg, params): - qid = params["qid"] - output_file = params["output_file"] + def result_presentation(self, response_msg: Message, additional_params: dict): + """ + This method writes the result of this turn into output files. Files are already opened before calling this + method and handlers passed in as arguments. + + response_msg: the result message generated for this turn. + additional_params: dict having params required by this method not present in the self.params class variable. + """ + qid = additional_params["qid"] + output_fh = additional_params["output_file_handler"] + verbose_fh = additional_params["verbose_file_handler"] + + verbose_fh.write(json.dumps(dict(response_msg))) + verbose_fh.write("\n") + if self.params["output_format"] == "trec": - if output_msg.msg_info["msg_type"] == "options": + if response_msg.msg_info["msg_type"] == "options": for (i, (option_name, option_id, output_score)) in enumerate( - output_msg.msg_info["options"] + response_msg.msg_info["options"] ): - output_file.write( + output_fh.write( qid + "\tQ0\t" + option_name @@ -68,12 +85,12 @@ def result_presentation(self, output_msg, params): "Therefore, the message type should be options." ) elif self.params["output_format"] == "text": - msg_type = output_msg.msg_info["msg_type"] + msg_type = response_msg.msg_info["msg_type"] if msg_type == "text": - output_file.write( + output_fh.write( qid + "\t" - + output_msg.response.replace("\n", " ").replace("\t", " ") + + response_msg.response.replace("\n", " ").replace("\t", " ") + "\n" ) else: diff --git a/macaw/interface/interface.py b/macaw/interface/interface.py index f867bc4..e2638be 100644 --- a/macaw/interface/interface.py +++ b/macaw/interface/interface.py @@ -18,5 +18,5 @@ def run(self): pass @abstractmethod - def result_presentation(self, response_msg: Message, params: dict): + def result_presentation(self, response_msg: Message, additional_params: dict): pass diff --git a/macaw/interface/stdio.py b/macaw/interface/stdio.py index a59ffa6..149dfb2 100644 --- a/macaw/interface/stdio.py +++ b/macaw/interface/stdio.py @@ -3,7 +3,7 @@ Authors: Hamed Zamani (hazamani@microsoft.com), George Wei (gzwei@umass.edu) """ - +import json import time import traceback from datetime import datetime @@ -19,34 +19,38 @@ def __init__(self, params): def run(self): print(f"Inside StdioInterface run method.") - while True: - try: - request = input("ENTER YOUR COMMAND (type 'exit' to leave): ").strip() - if request == "exit": - break - if len(request) == 0: - continue - user_info = {"first_name": "STDIO", "is_bot": "False"} - msg_info = { - "msg_id": self.msg_id, - "msg_type": "command" if request.startswith("#") else "text", - "msg_source": "user", - } - self.msg_id += 1 - msg = Message( - user_interface="stdio", - user_id=-1, - text=request, - timestamp=datetime.utcnow(), - user_info=user_info, - msg_info=msg_info, - ) - output = self.params["live_request_handler"](msg) - self.result_presentation(output, {}) - except Exception as ex: - traceback.print_exc() + with open(self.params["verbose_output_file_path"], "w+") as verbose_file: + while True: + try: + request = input("ENTER YOUR COMMAND (type 'exit' to leave): ").strip() + if request == "exit": + break + if len(request) == 0: + continue + user_info = {"first_name": "STDIO", "is_bot": "False"} + msg_info = { + "msg_id": self.msg_id, + "msg_type": "command" if request.startswith("#") else "text", + "msg_source": "user", + } + self.msg_id += 1 + msg = Message( + user_interface="stdio", + user_id=-1, + text=request, + timestamp=datetime.utcnow(), + user_info=user_info, + msg_info=msg_info, + ) + output = self.params["live_request_handler"](msg) + self.result_presentation(output, {"verbose_file_handler": verbose_file}) + except Exception as ex: + traceback.print_exc() + + def result_presentation(self, response_msg: Message, additional_params: dict): + verbose_fh = additional_params["verbose_file_handler"] + verbose_fh.write(json.dumps(dict(response_msg)) + "\n") - def result_presentation(self, response_msg: Message, params: dict): try: print("THE RESPONSE STARTS") print( diff --git a/macaw/interface/telegram.py b/macaw/interface/telegram.py index 08b19a8..d0951c8 100644 --- a/macaw/interface/telegram.py +++ b/macaw/interface/telegram.py @@ -159,12 +159,12 @@ def button_click_handler(self, update, context): except Exception as ex: traceback.print_exc() - def result_presentation(self, response_msg, params): + def result_presentation(self, response_msg, additional_params): """This method produces an appropriate response to be sent to the client.""" try: if response_msg is None: return - update = params["update"] + update = additional_params["update"] if response_msg.msg_info["msg_type"] == "text": if update.message is not None: update.message.reply_text(response_msg.text[: self.MAX_MSG_LEN]) diff --git a/macaw/live_main.py b/macaw/live_main.py index b50af55..778a863 100644 --- a/macaw/live_main.py +++ b/macaw/live_main.py @@ -71,6 +71,7 @@ def request_handler_func(self, conv_list: List[Message]) -> Message: conv_list[0].msg_info = output_msg.msg_info conv_list[0].response = output_msg.response conv_list[0].timestamp = output_msg.timestamp + conv_list[0].actions_result = rg_output return conv_list[0] @@ -108,9 +109,12 @@ def run(self): # These are interface parameters. They are interface specific. interface_params = { "interface": args.interface, # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' - # for experimental mode. + + # for experimental mode with fileio interface. "input_file_path": "/usr/src/app/data/file_input.txt", "output_file_path": "/usr/src/app/data/file_output.txt", + + "verbose_output_file_path": "/usr/src/app/data/file_output_verbose.txt", "output_format": "text", "bot_token": "YOUR_TELEGRAM_BOT_TOKEN", # Telegram bot token. # 'asr_model': 'google', # The API used for speech recognition.