diff --git a/.gitignore b/.gitignore index 9fc011c..a304e31 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,8 @@ reports/ \#*\# *.egg-info .idea/ + + +# Visual Studio Code +.vscode/*.log +*.code-workspace diff --git a/.prospector.yaml b/.prospector.yaml new file mode 100755 index 0000000..8c73e9f --- /dev/null +++ b/.prospector.yaml @@ -0,0 +1,62 @@ +mccabe: + disable: + - MC0001 + +pep8: + disable: + - E305 + - E306 + - E115 + - E116 + - E501 + - E722 + - E741 + +pycodestyle: + disable: + - E115 + - E116 + - E305 + - E306 + - E501 + - E722 + - E741 + +pyflakes: + disable: + - F401 + - F821 + - F841 + +pylint: + disable: + - arguments-renamed + - bare-except + - consider-using-f-string + - consider-using-with + - deprecated-module + - django-not-configured + - import-error + - import-outside-toplevel + - inconsistent-return-statements + - line-too-long + - logging-format-interpolation + - logging-not-lazy + - method-hidden + - multiple-imports + - no-else-raise + - no-else-return + - pointless-statement + - super-with-arguments + - too-many-arguments + - too-many-branches + - too-many-locals + - too-many-statements + - undefined-variable + - unidiomatic-typecheck + - unused-argument + - unused-import + - unused-variable + - unspecified-encoding + - useless-object-inheritance + - useless-suppression diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100755 index 0000000..5c0578a --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,91 @@ +{ + "editor.rulers": [79], + + "files.exclude": { + "**/*.egg-info": true, + "**/.git": true, + "**/.mypy_cache": true, + "**/*.pyc": {"when": "$(basename).py"}, + "**/__pycache__": true, + "**/.ropeproject": true + }, + + "python.analysis.diagnosticSeverityOverrides": { + "reportMissingImports" : "none", + "reportMissingModuleSource" : "none", + "reportUndefinedVariable" : "none" + }, + + "python.linting.enabled": true, + + "python.linting.pylintEnabled": true, + "python.linting.pylintArgs": [ + "--disable", "arguments-renamed", + "--disable", "bare-except", + "--disable", "broad-except", + "--disable", "c-extension-no-member", + "--disable", "consider-using-f-string", + "--disable", "consider-using-with", + "--disable", "deprecated-module", + "--disable", "fixme", + "--disable", "import-error", + "--disable", "import-outside-toplevel", + "--disable", "inconsistent-return-statements", + "--disable", "invalid-name", + "--disable", "line-too-long", + "--disable", "logging-format-interpolation", + "--disable", "logging-not-lazy", + "--disable", "method-hidden", + "--disable", "missing-class-docstring", + "--disable", "missing-function-docstring", + "--disable", "missing-module-docstring", + "--disable", "multiple-imports", + "--disable", "no-else-raise", + "--disable", "no-else-return", + "--disable", "no-self-use", + "--disable", "pointless-statement", + "--disable", "super-with-arguments", + "--disable", "too-few-public-methods", + "--disable", "too-many-arguments", + "--disable", "too-many-branches", + "--disable", "too-many-instance-attributes", + "--disable", "too-many-locals", + "--disable", "too-many-return-statements", + "--disable", "too-many-statements", + "--disable", "undefined-variable", + "--disable", "unidiomatic-typecheck", + "--disable", "unnecessary-pass", + "--disable", "unspecified-encoding", + "--disable", "unused-argument", + "--disable", "unused-import", + "--disable", "unused-variable", + "--disable", "useless-object-inheritance", + "--disable", "wrong-import-order" + ], + + "python.linting.flake8Enabled": 
true,
+    "python.linting.flake8Args": [
+        "--ignore=E115,E116,E123,E128,E226,E231,E261,E265,E266,E302,E303,E305,E306,E401,E501,E722,E741,F401,F821,F841,N806"
+    ],
+
+    "python.linting.mypyEnabled": false,
+
+    "python.linting.pydocstyleEnabled": false,
+
+    "python.linting.pycodestyleEnabled": true,
+    "python.linting.pycodestyleArgs": [
+        "--ignore=E115,E116,E123,E128,E226,E261,E265,E231,E266,E302,E303,E305,E306,E401,E501,E722,E741"
+    ],
+
+    "python.linting.prospectorEnabled": true,
+
+    "python.linting.pylamaEnabled": true,
+    "python.linting.pylamaArgs": [
+        "--ignore=C901,E115,E116,E123,E128,E226,E231,E261,E265,E266,E302,E303,E305,E306,E401,E501,E0602,E722,E741,W0611,W0612"
+    ],
+
+    "python.linting.banditEnabled": true,
+    "python.linting.banditArgs": [
+        "--skip=B103,B108,B110,B311"
+    ]
+}
diff --git a/README.md b/README.md
index 9a8b574..e6874db 100644
--- a/README.md
+++ b/README.md
@@ -3,58 +3,153 @@ xqueue_watcher
 This is an implementation of a polling [XQueue](https://github.com/edx/xqueue) client and grader.
+Overview
+========
-Running
-=======
-
-`python -m xqueue_watcher -d [path to settings directory]`
-
-
-JSON configuration file
-=======================
-    {
-        "test-123": {
-            "SERVER": "http://127.0.0.1:18040",
-            "CONNECTIONS": 1,
-            "AUTH": ["lms", "lms"],
-            "HANDLERS": [
-                {
-                    "HANDLER": "xqueue_watcher.grader.Grader",
-                    "KWARGS": {
-                        "grader_root": "/path/to/course/graders/",
-                    }
-                }
-            ]
-        }
-    }
+There are several components in a working XQueue Watcher service:
+- **XQueue Watcher**: polls an XQueue service continually for new submissions and grades them.
+- **Submissions Handler**: when the watcher finds a new submission, it passes it to the handler for grading. The handler is generic and can be configured to work with different submissions through individual submission graders.
+- **Individual Submission Grader**: each exercise or homework may specify its own "grader". This maps to a file on the server that usually defines test cases or additional processing for the student submission.
+
+Usually your server will look like this:
+```
+root/
+├── xqueue-watcher/
+│   ├── ...              # xqueue-watcher repo, unchanged
+│   └── ...
+├── config/
+│   ├── conf.d/
+│   │   └── my-course.json
+│   └── logging.json
+└── my-course/
+    ├── exercise1/
+    │   ├── grader.py    # per-exercise grader
+    │   └── answer.py    # if using JailedGrader
+    ├── ...
+    └── exercise2/
+        ├── grader.py
+        └── answer.py
+```
+Running XQueue Watcher
+======================
+
+Usually you can run XQueue Watcher without making any changes to it. Keep course-specific configuration and graders in separate folders, as shown above, so that you can update xqueue_watcher at any time.
+
+Install the requirements before running `xqueue_watcher`:
+```bash
+cd xqueue-watcher/
+make requirements
+```
+
+Now you're ready to run it.
+```bash
+python -m xqueue_watcher -d [path to the config directory, e.g. ../config]
+```
+
+The course configuration JSON file in `conf.d` should have the following structure:
+```json
+    {
+        "test-123": {
+            "SERVER": "http://127.0.0.1:18040",
+            "CONNECTIONS": 1,
+            "AUTH": ["lms", "lms"],
+            "HANDLERS": [
+                {
+                    "HANDLER": "xqueue_watcher.grader.Grader",
+                    "KWARGS": {
+                        "grader_root": "/path/to/course/graders/",
+                    }
+                }
+            ]
+        }
+    }
+```
 * `test-123`: the name of the queue
 * `SERVER`: XQueue server address
 * `AUTH`: list of username, password
 * `CONNECTIONS`: how many threads to spawn to watch the queue
 * `HANDLERS`: list of callables that will be called for each queue submission
-    * `HANDLER`: callable name
-    * `KWARGS`: optional keyword arguments to apply during instantiation
+    * `HANDLER`: callable name; see the Submissions Handler section below
+    * `KWARGS`: optional keyword arguments to apply during instantiation
+        * `grader_root`: path to the course directory, e.g. /path/to/my-course
+> TODO: document logging.json
-xqueue_watcher.grader.Grader
-========================
+Submissions Handler
+===================
+
+When xqueue_watcher detects a new submission, it passes it to the submissions handler for grading. A new handler is instantiated, based on the name configured above, with the submission information retrieved from XQueue. Two handlers ship with xqueue_watcher: `Grader` and `JailedGrader` (for Python, using CodeJail). If you don't use `JailedGrader`, you have to implement your own handler by subclassing `xqueue_watcher.grader.Grader`.
+
+The payload from XQueue is JSON that usually looks like the example below. Note that "grader" is a required field in "grader_payload" and must be configured accordingly in Studio for the exercise.
+```json
+{
+    "student_info": {
+        "random_seed": 1,
+        "submission_time": "20210109222647",
+        "anonymous_student_id": "6d07814a4ece5cdda54af1558a6dfec0"
+    },
+    "grader_payload": "\n {\"grader\": \"relative/path/to/grader.py\"}\n ",
+    "student_response": "print \"hello\"\r\n "
+}
+```
+
+## Custom Handler
 To implement a pull grader:
-Subclass xqueue_watcher.grader.Grader and override the `grade` method. Then add your grader to the config like `"handler": "my_module.MyGrader"`. The arguments for the `grade` method are:
- * `grader_path`: absolute path to the grader defined for the current problem
- * `grader_config`: other configuration particular to the problem
- * `student_response`: student-supplied code
+Subclass `xqueue_watcher.grader.Grader` and override the `grade` method. Then add your grader to the config like `"HANDLER": "my_module.MyGrader"`. The arguments for the `grade` method are:
+ * `grader_path`: absolute path to the grader defined for the current problem
+ * `grader_config`: other configuration particular to the problem
+ * `student_response`: student-supplied code
+Note that `grader_path` is constructed by appending the relative path from `grader_payload` to the `grader_root` in the configuration JSON. If the handler cannot find the grader file at that path, it will fail to grade the submission.
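For illustration, a minimal custom handler might look like the sketch below. The module name (`my_module`), the JSON answer file, and the exact-match check are invented for this example; the shape of the returned dictionary (`errors`, `tests`, `correct`, `score`) mirrors what `JailedGrader` returns in this repository.

```python
# my_module.py -- hypothetical custom handler, for illustration only
import json

from xqueue_watcher.grader import Grader


class MyGrader(Grader):
    def grade(self, grader_path, grader_config, student_response):
        # Same result shape as JailedGrader: errors/tests/correct/score.
        results = {'errors': [], 'tests': [], 'correct': False, 'score': 0}

        # Assume the per-problem grader file is a JSON document holding the
        # expected answer (purely illustrative).
        try:
            with open(grader_path) as f:
                expected = json.load(f)['answer']
        except (OSError, ValueError, KeyError):
            results['errors'].append('Could not load the grader for this problem.')
            return results

        correct = student_response.strip() == expected.strip()
        results['correct'] = correct
        results['score'] = 1 if correct else 0
        # (short description, long description, correct, expected output, actual output)
        results['tests'].append(('Exact-match check', '', correct, expected, student_response.strip()))
        return results
```

Point `"HANDLER"` at `my_module.MyGrader` in the queue configuration to use it.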
-Sandboxing
-==========
-To sandbox python, use [CodeJail](https://github.com/edx/codejail). In your handler configuration, add:
+## Grading Python submissions with JailedGrader
-    "CODEJAIL": {
-        "name": "python",
-        "python_bin": "/path/to/sandbox/python",
-        "user": "sandbox_username"
-    }
+`xqueue_watcher` provides a few utilities for grading Python submissions, including `JailedGrader`, which runs Python code in a sandboxed environment, plus grading support utilities.
+### JailedGrader
+To sandbox python, use [CodeJail](https://github.com/edx/codejail). In your handler configuration, add:
+```json
+    "HANDLER": "xqueue_watcher.jailedgrader.JailedGrader",
+    "CODEJAIL": {
+        "name": "python",
+        "python_bin": "/path/to/sandbox/python",
+        "user": "sandbox_username"
+    }
+```
 Then, `codejail_python` will automatically be added to the kwargs for your handler. You can then import codejail.jail_code and run `jail_code("python", code...)`. You can define multiple sandboxes and use them as in `jail_code("special-python", ...)`
+
+To use `JailedGrader`, you also need to provide an `answer.py` file in the same folder as the `grader.py` file. The grader runs both the student submission and `answer.py` and compares their outputs.
+
+### Grading Support utilities
+There are several grading support utilities that make writing `grader.py` for Python exercises easy. Check out
+`grader_support/gradelib.py` for the documentation.
+
+- `grader_support.gradelib.Grader`: a base class for creating a new submission grader. Not to be confused with `xqueue_watcher.grader.Grader`. You can add input checks, preprocessors and tests to a grader object.
+- `grader_support.gradelib.Test`: a base class for creating tests for a submission. Usually a submission can be graded with one or a few tests. There are also a few useful test functions and classes included, like `InvokeStudentFunctionTest`, `exec_wrapped_code`, etc.
+- Preprocessors: utilities to process the raw submission before grading it. `wrap_in_string` is useful for testing code that is not wrapped in a function.
+- Input checks: sanity checks run before a submission is executed, e.g. `required_string` or `prohibited_string`.
+
+Using the provided grader class, your `grader.py` would look something like this:
+```python
+from grader_support import gradelib
+grader = gradelib.Grader()
+
+# invoke the student function foo with arguments []
+grader.add_test(gradelib.InvokeStudentFunctionTest('foo', []))
+```
+
+Or with a preprocessor:
+```python
+from grader_support import gradelib
+
+grader = gradelib.Grader()
+
+# execute raw student code & capture stdout
+grader.add_preprocessor(gradelib.wrap_in_string)
+grader.add_test(gradelib.ExecWrappedStudentCodeTest({}, "basic test"))
+```
+
+You can also write your own test classes, preprocessors and input checks, as sketched below.
\ No newline at end of file
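For example, a custom input check is just a function from the submission text to an error string (or `None` if the submission is acceptable), as documented in `gradelib.py`. The check below is invented for illustration, and the `add_input_check` call is assumed by analogy with `add_test` and `add_preprocessor`.

```python
from grader_support import gradelib

grader = gradelib.Grader()

def no_forbidden_import(submission_text):
    """Input check: reject submissions that import the os module."""
    if 'import os' in submission_text:
        return 'Please solve this exercise without importing the os module.'
    return None

# Assumed to exist by analogy with add_test()/add_preprocessor().
grader.add_input_check(no_forbidden_import)

# invoke the student function foo with arguments []
grader.add_test(gradelib.InvokeStudentFunctionTest('foo', []))
```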
diff --git a/grader_support/gradelib.py b/grader_support/gradelib.py
index 6f16721..bc6dd5d 100644
--- a/grader_support/gradelib.py
+++ b/grader_support/gradelib.py
@@ -69,6 +69,9 @@ def __init__(self):
         # list of functions: submission_text -> error text or None
         self._input_checks = []
+
+        # Flag: Do not run, just check input
+        self._only_check_input = False
 
         # list of functions: submission_text -> processed_submission_text. Run
         # in the specified order. (foldl)
         self._preprocessors = [fix_line_endings]
@@ -88,6 +91,12 @@ def input_errors(self, submission_str):
         """
         return [_f for _f in [check(submission_str) for check in self._input_checks] if _f]
 
+    def only_check_input(self):
+        return self._only_check_input
+
+    def set_only_check_input(self, value):
+        self._only_check_input = value
+
     def preprocess(self, submission_str):
         """
         submission: string
@@ -546,6 +555,17 @@ def __init__(self, fn_name, args, environment=None, output_writer=None, short_de
             short_desc = "Test: %s(%s)" % (fn_name, ", ".join(repr(a) for a in args))
         Test.__init__(self, test_fn, short_desc, detailed_desc, compare)
 
+class ExecWrappedStudentCodeTest(Test):
+    """
+    A Test that execs the wrapped student code and captures its stdout.
+    The code must be preprocessed with `wrap_in_string`.
+    """
+    def __init__(self, environment=None, short_desc=None, detailed_desc=None, compare=None):
+        test_fn = exec_wrapped_code(environment)
+        if short_desc is None:
+            short_desc = "Test: execute wrapped student code"
+        Test.__init__(self, test_fn, short_desc, detailed_desc, compare)
+
 def round_float_writer(n):
     """
     Returns an output_writer function that rounds its argument to `n` places.
diff --git a/requirements/production.txt b/requirements/production.txt
index eee07bc..9ce674a 100644
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -4,7 +4,7 @@
 #
 #    make upgrade
 #
--e git+https://github.com/edx/codejail.git@4127fc4bd5775cc72aee8d7f0a70e31405e22439#egg=codejail  # via -r requirements/base.txt
+EdX-CodeJail >= 3.2.0
 backports.os==0.1.1       # via -r requirements/base.txt, path.py
 certifi==2020.6.20        # via -r requirements/base.txt, requests
 chardet==3.0.4            # via -r requirements/base.txt, requests
diff --git a/setup.py b/setup.py
index a4c14f6..6de6c1e 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,8 @@
     version='0.2',
     description='XQueue Pull Grader',
     packages=[
+        'grader_support',
         'xqueue_watcher',
     ],
-    install_requires=open('requirements/production.txt', 'rb').readlines()
+    install_requires=open('requirements/production.txt', 'r').readlines()
 )
diff --git a/xqueue_watcher/grader.py b/xqueue_watcher/grader.py
index d3b50d6..7b1a542 100644
--- a/xqueue_watcher/grader.py
+++ b/xqueue_watcher/grader.py
@@ -116,9 +116,13 @@ def process_item(self, content, queue=None):
             files = content['xqueue_files']
 
         # Delivery from the lms
+        print("____ DEBUG ____")
+        print(body)
         body = json.loads(body)
         student_response = body['student_response']
         payload = body['grader_payload']
+        print(student_response)
+        print(payload)
         try:
             grader_config = json.loads(payload)
         except ValueError as err:
@@ -130,8 +134,12 @@ def process_item(self, content, queue=None):
             raise
 
         self.log.debug("Processing submission, grader payload: {0}".format(payload))
+        #relative_grader_path = 'lesson1_hw2/grader.py' # TODO actually have a grader in the config
         relative_grader_path = grader_config['grader']
         grader_path = (self.grader_root / relative_grader_path).abspath()
+        print("___ DEBUG ___")
+        print("Grader path", grader_path)
+        print("Relative path", relative_grader_path)
 
         start = time.time()
         results = self.grade(grader_path, grader_config, student_response)
diff --git a/xqueue_watcher/jailedgrader.py b/xqueue_watcher/jailedgrader.py
index 8bab17f..be45b8a 100644
--- a/xqueue_watcher/jailedgrader.py
+++ b/xqueue_watcher/jailedgrader.py
@@ -10,6 +10,7 @@
 import gettext
 from path import Path
 import six
+import traceback
 
 import codejail
 
@@ -66,7 +67,7 @@ class JailedGrader(Grader):
     and optionally codejail_python="python
name" (the name that you used to configure codejail) """ def __init__(self, *args, **kwargs): - self.codejail_python = kwargs.pop("codejail_python", "python") + self.codejail_python = kwargs.pop("codejail_python", "python3") super(JailedGrader, self).__init__(*args, **kwargs) self.locale_dir = self.grader_root / "conf" / "locale" self.fork_per_item = False # it's probably safe not to fork @@ -78,14 +79,24 @@ def _enable_i18n(self, language): trans.install(names=None) def _run(self, grader_path, thecode, seed): + print("--- RUN CODE ---", grader_path, thecode) files = SUPPORT_FILES + [grader_path] if self.locale_dir.exists(): files.append(self.locale_dir) extra_files = [('submission.py', thecode.encode('utf-8'))] argv = ["-m", "grader_support.run", Path(grader_path).basename(), 'submission.py', seed] + print("argv -- ", argv) + print("files", files) + print("extra_files", extra_files) r = codejail.jail_code.jail_code(self.codejail_python, files=files, extra_files=extra_files, argv=argv) + print("result", r.status, r.stdout, r.stderr) return r + def read_answer_file(self, answer_path): + print("answer path", answer_path) + with open(answer_path, 'rb') as f: + return f.read().decode('utf-8') + def grade(self, grader_path, grader_config, submission): if type(submission) != six.text_type: self.log.warning("Submission is NOT unicode") @@ -116,9 +127,24 @@ def grade(self, grader_path, grader_config, submission): self._enable_i18n(grader_config.get("lang", LANGUAGE)) - answer_path = Path(grader_path).dirname() / 'answer.py' - with open(answer_path, 'rb') as f: - answer = f.read().decode('utf-8') + print("__DEBUG__") + print("grader path", grader_path) + + answers = [] + try: + answer_dir_files = os.listdir(os.path.dirname(grader_path)) + answer_files = list(filter(lambda f: f.lower().startswith('answer') and f.endswith('.py'), answer_dir_files)) + answers = [Path(grader_path).dirname() + '/' + a for a in answer_files] + print("Multiple answer files: " + str(answers)) + except Exception: + print(traceback.format_exc()) + answers = [Path(grader_path).dirname() + '/answer.py'] + + read_answer_files = [self.read_answer_file(f) for f in answers] + #answer_path = Path(grader_path).dirname() / 'answer.py' + #print("answer path", answer_path) + #with open(answer_path, 'rb') as f: + #answer = f.read().decode('utf-8') # Import the grader, straight from the original file. (It probably isn't in # sys.path, and we may be in a long running gunicorn process, so we don't @@ -133,23 +159,41 @@ def grade(self, grader_path, grader_config, submission): # Don't run tests if there were errors return results + if grader.only_check_input(): + results['correct'] = True + results['score'] = 1 + self.log.debug('Only checking inputs, returning correct.') + return results + # Add a unicode encoding declaration. - processed_answer = prepend_coding(grader.preprocess(answer)) + #processed_answer = prepend_coding(grader.preprocess(answer)) + processed_answers = [prepend_coding(grader.preprocess(a)) for a in read_answer_files] processed_submission = prepend_coding(grader.preprocess(submission)) + #print("processed answer", processed_answer) + for a in processed_answers: + print("processed answer", a) + print("processed_submission", processed_submission) + # Same seed for both runs seed = str(random.randint(0, 20000)) # Run the official answer, to get the expected output. 
expected_ok = False expected_exc = None + expected_solutions = [] try: # If we want a factor of two speedup for now: trust the staff solution to # avoid hitting the sandbox. (change run to run_trusted) expected_outputs = None # in case run_trusted raises an exception. - expected_outputs = self._run(grader_path, processed_answer, seed).stdout + #expected_outputs = self._run(grader_path, processed_answer, seed).stdout + expected_outputs = [self._run(grader_path, a, seed).stdout for a in processed_answers] + print("expected_outputs", expected_outputs) if expected_outputs: - expected = json.loads(expected_outputs.decode('utf-8')) + for o in expected_outputs: + if o: + expected = json.loads(o.decode('utf-8')) + expected_solutions.append(expected) expected_ok = True except Exception: expected_exc = sys.exc_info() @@ -205,42 +249,63 @@ def grade(self, grader_path, grader_config, submission): # Compare actual and expected through the grader tests, but only if we haven't # already found a problem. corrects = [] + tests_dont_match_up = [] if not results['errors']: - expected_results = expected['results'] - actual_results = actual['results'] - if len(expected_results) != len(actual_results): + is_num_results_diff = True + for expected in expected_solutions: + expected_results = expected['results'] + actual_results = actual['results'] + if len(expected_results) == len(actual_results): + is_num_results_diff = False + break + if is_num_results_diff: results['errors'].append(_('Something went wrong: different numbers of ' 'tests ran for your code and for our reference code.')) return results - for test, exp, act in zip(grader.tests(), expected_results, actual_results): - exp_short_desc, exp_long_desc, exp_output = exp - act_short_desc, act_long_desc, act_output = act - if exp_short_desc != act_short_desc: - results['errors'].append(_("Something went wrong: tests don't match up.")) - # TODO: don't give up so easily? - return results - # Truncate here--we don't want to send long output back, and also don't want to - # confuse students by comparing the full output but sending back truncated output. - act_output = truncate(act_output) - try: - correct = test.compare_results(exp_output, act_output) - except EndTest as e: - # Allows a grader's compare_results function to raise an EndTest exception - # (defined in gradelib.py). This enables the checker to print out an error - # message to the student, which will be appended to the end of stdout. - if e is not None: - act_output += '\n' - error_msg = _("ERROR") - act_output += "*** {error_msg}: {error_detail} ***".format( - error_msg=error_msg, - error_detail=e - ) - correct = False - corrects.append(correct) - if not grader_config.get("hide_output", False): - results['tests'].append((exp_short_desc, exp_long_desc, - correct, exp_output, act_output)) + final_results = results + for expected in expected_solutions: + corrects = [] + results = final_results + expected_results = expected['results'] + actual_results = actual['results'] + for test, exp, act in zip(grader.tests(), expected_results, actual_results): + exp_short_desc, exp_long_desc, exp_output = exp + act_short_desc, act_long_desc, act_output = act + tests_dont_match_up.append(exp_short_desc != act_short_desc) + if exp_short_desc != act_short_desc: + #results['errors'].append(_("Something went wrong: tests don't match up.")) + # TODO: don't give up so easily? 
+ #return results + # Jump to next solution, no need to compare results here + next + # Truncate here--we don't want to send long output back, and also don't want to + # confuse students by comparing the full output but sending back truncated output. + act_output = truncate(act_output) + try: + correct = test.compare_results(exp_output, act_output) + except EndTest as e: + # Allows a grader's compare_results function to raise an EndTest exception + # (defined in gradelib.py). This enables the checker to print out an error + # message to the student, which will be appended to the end of stdout. + if e is not None: + act_output += '\n' + error_msg = _("ERROR") + act_output += "*** {error_msg}: {error_detail} ***".format( + error_msg=error_msg, + error_detail=e + ) + correct = False + corrects.append(correct) + if not grader_config.get("hide_output", False): + results['tests'].append((exp_short_desc, exp_long_desc, + correct, exp_output, act_output)) + if len(corrects) > 0 and all(corrects): + break # This solution works, short-circuit here + # All solutions ran into "tests don't match up" problem + if all(tests_dont_match_up): + results['errors'].append(_("Something went wrong: tests don't match up.")) + return results # If there were no tests run, then there was probably an error, so it's incorrect n = len(corrects) @@ -272,15 +337,16 @@ def main(args): # pragma: no cover if len(args) != 2: return - configure("python", sys.executable, user=getpass.getuser()) + configure('python3', sys.executable, user=getpass.getuser()) (grader_path, submission_path) = args with open(submission_path) as f: - submission = f.read().decode('utf-8') + submission = f.read() # .decode('utf-8') grader_config = {"lang": "eo"} - grader_path = path(grader_path).abspath() - g = JailedGrader(grader_root=grader_path.dirname().parent.parent) + grader_path = Path(grader_path).abspath() + g = JailedGrader(grader_root=grader_path.dirname().parent.parent, + codejail_python='python3') pprint(g.grade(grader_path, grader_config, submission)) diff --git a/xqueue_watcher/jailedgrader.py.2022-03-20-fully-functional-single-answer b/xqueue_watcher/jailedgrader.py.2022-03-20-fully-functional-single-answer new file mode 100644 index 0000000..0799a84 --- /dev/null +++ b/xqueue_watcher/jailedgrader.py.2022-03-20-fully-functional-single-answer @@ -0,0 +1,307 @@ +""" +An implementation of a grader that uses codejail to sandbox submission execution. +""" +import codecs +import os +import sys +import imp +import json +import random +import gettext +from path import Path +import six + +import codejail + +from grader_support.gradelib import EndTest +from grader_support.graderutil import LANGUAGE +import grader_support + +from .grader import Grader +from six.moves import zip + +TIMEOUT = 1 + +def path_to_six(): + """ + Return the full path to six.py + """ + if any(six.__file__.endswith(suffix) for suffix in ('.pyc', '.pyo')): + # __file__ points to the compiled bytecode in python 2 + return Path(six.__file__[:-1]) + else: + # __file__ points to the .py file in python 3 + return Path(six.__file__) + + +SUPPORT_FILES = [ + Path(grader_support.__file__).dirname(), + path_to_six(), +] + + +def truncate(out): + """ + Truncate test output that's too long. This is per-test. + """ + TOO_LONG = 5000 # 5K bytes seems like enough for a single test. 
+ if len(out) > TOO_LONG: + out = out[:TOO_LONG] + "...OUTPUT TRUNCATED" + + return out + + +def prepend_coding(code): + """ + Add a coding line--makes submissions with inline unicode not + explode (as long as they're utf8, I guess) + """ + return '# coding: utf8\n' + code + + +class JailedGrader(Grader): + """ + A grader implementation that uses codejail. + Instantiate it with grader_root="path/to/graders" + and optionally codejail_python="python name" (the name that you used to configure codejail) + """ + def __init__(self, *args, **kwargs): + self.codejail_python = kwargs.pop("codejail_python", "python") + super(JailedGrader, self).__init__(*args, **kwargs) + self.locale_dir = self.grader_root / "conf" / "locale" + self.fork_per_item = False # it's probably safe not to fork + # EDUCATOR-3368: OpenBLAS library is allowed to allocate 1 thread + os.environ["OPENBLAS_NUM_THREADS"] = "1" + + def _enable_i18n(self, language): + trans = gettext.translation('graders', localedir=self.locale_dir, fallback=True, languages=[language]) + trans.install(names=None) + + def _run(self, grader_path, thecode, seed): + print("--- RUN CODE ---", grader_path, thecode) + files = SUPPORT_FILES + [grader_path] + if self.locale_dir.exists(): + files.append(self.locale_dir) + extra_files = [('submission.py', thecode.encode('utf-8'))] + argv = ["-m", "grader_support.run", Path(grader_path).basename(), 'submission.py', seed] + print("argv -- ", argv) + print("files", files) + print("extra_files", extra_files) + r = codejail.jail_code.jail_code(self.codejail_python, files=files, extra_files=extra_files, argv=argv) + print("result", r.status, r.stdout, r.stderr) + return r + + def grade(self, grader_path, grader_config, submission): + if type(submission) != six.text_type: + self.log.warning("Submission is NOT unicode") + + results = { + 'errors': [], + 'tests': [], + 'correct': False, + 'score': 0, + } + + # There are some cases where the course team would like to accept a + # student submission but not process the student code. Some examples are + # cases where the problem would require dependencies that are difficult + # or impractical to install in a sandbox or if the complexity of the + # solution would cause the runtime of the student code to exceed what is + # possible in the sandbox. + + # skip_grader is a flag in the grader config which is a boolean. If it + # is set to true on a problem then it will always show that the + # submission is correct and give the student a full score for the + # problem. + if grader_config.get('skip_grader', False): + results['correct'] = True + results['score'] = 1 + self.log.debug('Skipping the grader.') + return results + + self._enable_i18n(grader_config.get("lang", LANGUAGE)) + + print("__DEBUG__") + print("grader path", grader_path) + + answer_path = Path(grader_path).dirname() / 'answer.py' + print("answer path", answer_path) + with open(answer_path, 'rb') as f: + answer = f.read().decode('utf-8') + + # Import the grader, straight from the original file. (It probably isn't in + # sys.path, and we may be in a long running gunicorn process, so we don't + # want to add stuff to sys.path either.) 
+ grader_module = imp.load_source("grader_module", six.text_type(grader_path)) + grader = grader_module.grader + + # Preprocess for grader-specified errors + errors = grader.input_errors(submission) + if errors != []: + results['errors'].extend(errors) + # Don't run tests if there were errors + return results + + if grader.only_check_input(): + results['correct'] = True + results['score'] = 1 + self.log.debug('Only checking inputs, returning correct.') + return results + + # Add a unicode encoding declaration. + processed_answer = prepend_coding(grader.preprocess(answer)) + processed_submission = prepend_coding(grader.preprocess(submission)) + + print("processed answer", processed_answer) + print("processed_submission", processed_submission) + + # Same seed for both runs + seed = str(random.randint(0, 20000)) + + # Run the official answer, to get the expected output. + expected_ok = False + expected_exc = None + try: + # If we want a factor of two speedup for now: trust the staff solution to + # avoid hitting the sandbox. (change run to run_trusted) + expected_outputs = None # in case run_trusted raises an exception. + expected_outputs = self._run(grader_path, processed_answer, seed).stdout + print("expected_outputs", expected_outputs) + if expected_outputs: + expected = json.loads(expected_outputs.decode('utf-8')) + expected_ok = True + except Exception: + expected_exc = sys.exc_info() + else: + # We just ran the official answer, nothing should have gone wrong, so check + # everything, and note it as bad if anything is wrong. + if expected_ok: + if expected['exceptions'] \ + or expected['grader']['status'] != 'ok' \ + or expected['submission']['status'] != 'ok': + expected_ok = False + + if not expected_ok: + # We couldn't run the official answer properly, bail out, but don't show + # details to the student, since none of it is their code. + results['errors'].append(_('There was a problem running the staff solution (Staff debug: L364)')) + self.log.error("Couldn't run staff solution. grader = %s, output: %r", + grader_path, expected_outputs, exc_info=expected_exc) + return results + + # The expected code ran fine, go ahead and run the student submission. + actual_ok = False + actual_exc = None + try: + # Do NOT trust the student solution (in production). + actual_outputs = None # in case run raises an exception. + actual_outputs = self._run(grader_path, processed_submission, seed).stdout + if actual_outputs: + actual = json.loads(actual_outputs.decode('utf-8')) + actual_ok = True + else: + results['errors'].append(_("There was a problem running your solution (Staff debug: L379).")) + except Exception: + actual_exc = sys.exc_info() + else: + if actual_ok and actual['grader']['status'] == 'ok': + if actual['submission']['status'] != 'ok': + # The grader ran OK, but the student code didn't, so show the student + # details of what went wrong. There is probably an exception to show. + shown_error = actual['submission']['exception'] or _('There was an error thrown while running your solution.') + results['errors'].append(shown_error) + else: + # The grader didn't run well, we are going to bail. + actual_ok = False + + # If something went wrong, then don't continue + if not actual_ok: + results['errors'].append(_("We couldn't run your solution (Staff debug: L397).")) + self.log.error("Couldn't run student solution. 
grader = %s, output: %r", + grader_path, actual_outputs, exc_info=actual_exc) + return results + + # Compare actual and expected through the grader tests, but only if we haven't + # already found a problem. + corrects = [] + if not results['errors']: + expected_results = expected['results'] + actual_results = actual['results'] + if len(expected_results) != len(actual_results): + results['errors'].append(_('Something went wrong: different numbers of ' + 'tests ran for your code and for our reference code.')) + return results + + for test, exp, act in zip(grader.tests(), expected_results, actual_results): + exp_short_desc, exp_long_desc, exp_output = exp + act_short_desc, act_long_desc, act_output = act + if exp_short_desc != act_short_desc: + results['errors'].append(_("Something went wrong: tests don't match up.")) + # TODO: don't give up so easily? + return results + # Truncate here--we don't want to send long output back, and also don't want to + # confuse students by comparing the full output but sending back truncated output. + act_output = truncate(act_output) + try: + correct = test.compare_results(exp_output, act_output) + except EndTest as e: + # Allows a grader's compare_results function to raise an EndTest exception + # (defined in gradelib.py). This enables the checker to print out an error + # message to the student, which will be appended to the end of stdout. + if e is not None: + act_output += '\n' + error_msg = _("ERROR") + act_output += "*** {error_msg}: {error_detail} ***".format( + error_msg=error_msg, + error_detail=e + ) + correct = False + corrects.append(correct) + if not grader_config.get("hide_output", False): + results['tests'].append((exp_short_desc, exp_long_desc, + correct, exp_output, act_output)) + + # If there were no tests run, then there was probably an error, so it's incorrect + n = len(corrects) + results['correct'] = all(corrects) and n > 0 + results['score'] = float(sum(corrects))/n if n > 0 else 0 + + if n == 0 and len(results['errors']) == 0: + results['errors'] = [ + _("There was a problem while running your code (Staff debug: L450). " + "Please contact the course staff for assistance.") + ] + + return results + + +def main(args): # pragma: no cover + """ + Prints a json list: + [ ("Test description", "value") ] + + TODO: what about multi-file submission? + """ + import logging + from pprint import pprint + from codejail.jail_code import configure + import getpass + + logging.basicConfig(level=logging.DEBUG) + if len(args) != 2: + return + + configure("python", sys.executable, user=getpass.getuser()) + (grader_path, submission_path) = args + + with open(submission_path) as f: + submission = f.read().decode('utf-8') + + grader_config = {"lang": "eo"} + grader_path = path(grader_path).abspath() + g = JailedGrader(grader_root=grader_path.dirname().parent.parent) + pprint(g.grade(grader_path, grader_config, submission)) + + +if __name__ == '__main__': # pragma: no cover + main(sys.argv[1:]) diff --git a/xqueue_watcher/jailedgrader.py.bak.2021-04-09-original b/xqueue_watcher/jailedgrader.py.bak.2021-04-09-original new file mode 100644 index 0000000..c804280 --- /dev/null +++ b/xqueue_watcher/jailedgrader.py.bak.2021-04-09-original @@ -0,0 +1,301 @@ +""" +An implementation of a grader that uses codejail to sandbox submission execution. 
+""" +import codecs +import os +import sys +import imp +import json +import random +import gettext +from path import Path +import six + +import codejail + +from grader_support.gradelib import EndTest +from grader_support.graderutil import LANGUAGE +import grader_support + +from .grader import Grader +from six.moves import zip + +TIMEOUT = 1 + +def path_to_six(): + """ + Return the full path to six.py + """ + if any(six.__file__.endswith(suffix) for suffix in ('.pyc', '.pyo')): + # __file__ points to the compiled bytecode in python 2 + return Path(six.__file__[:-1]) + else: + # __file__ points to the .py file in python 3 + return Path(six.__file__) + + +SUPPORT_FILES = [ + Path(grader_support.__file__).dirname(), + path_to_six(), +] + + +def truncate(out): + """ + Truncate test output that's too long. This is per-test. + """ + TOO_LONG = 5000 # 5K bytes seems like enough for a single test. + if len(out) > TOO_LONG: + out = out[:TOO_LONG] + "...OUTPUT TRUNCATED" + + return out + + +def prepend_coding(code): + """ + Add a coding line--makes submissions with inline unicode not + explode (as long as they're utf8, I guess) + """ + return '# coding: utf8\n' + code + + +class JailedGrader(Grader): + """ + A grader implementation that uses codejail. + Instantiate it with grader_root="path/to/graders" + and optionally codejail_python="python name" (the name that you used to configure codejail) + """ + def __init__(self, *args, **kwargs): + self.codejail_python = kwargs.pop("codejail_python", "python") + super(JailedGrader, self).__init__(*args, **kwargs) + self.locale_dir = self.grader_root / "conf" / "locale" + self.fork_per_item = False # it's probably safe not to fork + # EDUCATOR-3368: OpenBLAS library is allowed to allocate 1 thread + os.environ["OPENBLAS_NUM_THREADS"] = "1" + + def _enable_i18n(self, language): + trans = gettext.translation('graders', localedir=self.locale_dir, fallback=True, languages=[language]) + trans.install(names=None) + + def _run(self, grader_path, thecode, seed): + print("--- RUN CODE ---", grader_path, thecode) + files = SUPPORT_FILES + [grader_path] + if self.locale_dir.exists(): + files.append(self.locale_dir) + extra_files = [('submission.py', thecode.encode('utf-8'))] + argv = ["-m", "grader_support.run", Path(grader_path).basename(), 'submission.py', seed] + print("argv -- ", argv) + print("files", files) + print("extra_files", extra_files) + r = codejail.jail_code.jail_code(self.codejail_python, files=files, extra_files=extra_files, argv=argv) + print("result", r.status, r.stdout, r.stderr) + return r + + def grade(self, grader_path, grader_config, submission): + if type(submission) != six.text_type: + self.log.warning("Submission is NOT unicode") + + results = { + 'errors': [], + 'tests': [], + 'correct': False, + 'score': 0, + } + + # There are some cases where the course team would like to accept a + # student submission but not process the student code. Some examples are + # cases where the problem would require dependencies that are difficult + # or impractical to install in a sandbox or if the complexity of the + # solution would cause the runtime of the student code to exceed what is + # possible in the sandbox. + + # skip_grader is a flag in the grader config which is a boolean. If it + # is set to true on a problem then it will always show that the + # submission is correct and give the student a full score for the + # problem. 
+ if grader_config.get('skip_grader', False): + results['correct'] = True + results['score'] = 1 + self.log.debug('Skipping the grader.') + return results + + self._enable_i18n(grader_config.get("lang", LANGUAGE)) + + print("__DEBUG__") + print("grader path", grader_path) + + answer_path = Path(grader_path).dirname() / 'answer.py' + print("answer path", answer_path) + with open(answer_path, 'rb') as f: + answer = f.read().decode('utf-8') + + # Import the grader, straight from the original file. (It probably isn't in + # sys.path, and we may be in a long running gunicorn process, so we don't + # want to add stuff to sys.path either.) + grader_module = imp.load_source("grader_module", six.text_type(grader_path)) + grader = grader_module.grader + + # Preprocess for grader-specified errors + errors = grader.input_errors(submission) + if errors != []: + results['errors'].extend(errors) + # Don't run tests if there were errors + return results + + # Add a unicode encoding declaration. + processed_answer = prepend_coding(grader.preprocess(answer)) + processed_submission = prepend_coding(grader.preprocess(submission)) + + print("processed answer", processed_answer) + print("processed_submission", processed_submission) + + # Same seed for both runs + seed = str(random.randint(0, 20000)) + + # Run the official answer, to get the expected output. + expected_ok = False + expected_exc = None + try: + # If we want a factor of two speedup for now: trust the staff solution to + # avoid hitting the sandbox. (change run to run_trusted) + expected_outputs = None # in case run_trusted raises an exception. + expected_outputs = self._run(grader_path, processed_answer, seed).stdout + print("expected_outputs", expected_outputs) + if expected_outputs: + expected = json.loads(expected_outputs.decode('utf-8')) + expected_ok = True + except Exception: + expected_exc = sys.exc_info() + else: + # We just ran the official answer, nothing should have gone wrong, so check + # everything, and note it as bad if anything is wrong. + if expected_ok: + if expected['exceptions'] \ + or expected['grader']['status'] != 'ok' \ + or expected['submission']['status'] != 'ok': + expected_ok = False + + if not expected_ok: + # We couldn't run the official answer properly, bail out, but don't show + # details to the student, since none of it is their code. + results['errors'].append(_('There was a problem running the staff solution (Staff debug: L364)')) + self.log.error("Couldn't run staff solution. grader = %s, output: %r", + grader_path, expected_outputs, exc_info=expected_exc) + return results + + # The expected code ran fine, go ahead and run the student submission. + actual_ok = False + actual_exc = None + try: + # Do NOT trust the student solution (in production). + actual_outputs = None # in case run raises an exception. + actual_outputs = self._run(grader_path, processed_submission, seed).stdout + if actual_outputs: + actual = json.loads(actual_outputs.decode('utf-8')) + actual_ok = True + else: + results['errors'].append(_("There was a problem running your solution (Staff debug: L379).")) + except Exception: + actual_exc = sys.exc_info() + else: + if actual_ok and actual['grader']['status'] == 'ok': + if actual['submission']['status'] != 'ok': + # The grader ran OK, but the student code didn't, so show the student + # details of what went wrong. There is probably an exception to show. 
+ shown_error = actual['submission']['exception'] or _('There was an error thrown while running your solution.') + results['errors'].append(shown_error) + else: + # The grader didn't run well, we are going to bail. + actual_ok = False + + # If something went wrong, then don't continue + if not actual_ok: + results['errors'].append(_("We couldn't run your solution (Staff debug: L397).")) + self.log.error("Couldn't run student solution. grader = %s, output: %r", + grader_path, actual_outputs, exc_info=actual_exc) + return results + + # Compare actual and expected through the grader tests, but only if we haven't + # already found a problem. + corrects = [] + if not results['errors']: + expected_results = expected['results'] + actual_results = actual['results'] + if len(expected_results) != len(actual_results): + results['errors'].append(_('Something went wrong: different numbers of ' + 'tests ran for your code and for our reference code.')) + return results + + for test, exp, act in zip(grader.tests(), expected_results, actual_results): + exp_short_desc, exp_long_desc, exp_output = exp + act_short_desc, act_long_desc, act_output = act + if exp_short_desc != act_short_desc: + results['errors'].append(_("Something went wrong: tests don't match up.")) + # TODO: don't give up so easily? + return results + # Truncate here--we don't want to send long output back, and also don't want to + # confuse students by comparing the full output but sending back truncated output. + act_output = truncate(act_output) + try: + correct = test.compare_results(exp_output, act_output) + except EndTest as e: + # Allows a grader's compare_results function to raise an EndTest exception + # (defined in gradelib.py). This enables the checker to print out an error + # message to the student, which will be appended to the end of stdout. + if e is not None: + act_output += '\n' + error_msg = _("ERROR") + act_output += "*** {error_msg}: {error_detail} ***".format( + error_msg=error_msg, + error_detail=e + ) + correct = False + corrects.append(correct) + if not grader_config.get("hide_output", False): + results['tests'].append((exp_short_desc, exp_long_desc, + correct, exp_output, act_output)) + + # If there were no tests run, then there was probably an error, so it's incorrect + n = len(corrects) + results['correct'] = all(corrects) and n > 0 + results['score'] = float(sum(corrects))/n if n > 0 else 0 + + if n == 0 and len(results['errors']) == 0: + results['errors'] = [ + _("There was a problem while running your code (Staff debug: L450). " + "Please contact the course staff for assistance.") + ] + + return results + + +def main(args): # pragma: no cover + """ + Prints a json list: + [ ("Test description", "value") ] + + TODO: what about multi-file submission? 
+ """ + import logging + from pprint import pprint + from codejail.jail_code import configure + import getpass + + logging.basicConfig(level=logging.DEBUG) + if len(args) != 2: + return + + configure("python", sys.executable, user=getpass.getuser()) + (grader_path, submission_path) = args + + with open(submission_path) as f: + submission = f.read().decode('utf-8') + + grader_config = {"lang": "eo"} + grader_path = path(grader_path).abspath() + g = JailedGrader(grader_root=grader_path.dirname().parent.parent) + pprint(g.grade(grader_path, grader_config, submission)) + + +if __name__ == '__main__': # pragma: no cover + main(sys.argv[1:]) diff --git a/xqueue_watcher/manager.py b/xqueue_watcher/manager.py index db64152..63b2dd1 100644 --- a/xqueue_watcher/manager.py +++ b/xqueue_watcher/manager.py @@ -70,7 +70,9 @@ def configure(self, configuration): """ Configure XQueue clients. """ + print(configuration) for queue_name, config in configuration.items(): + print(config) for i in range(config.get('CONNECTIONS', 1)): watcher = self.client_from_config(queue_name, config) self.clients.append(watcher)