diff --git a/mglib/mglib.py b/mglib/mglib.py index d52568b..98e2490 100644 --- a/mglib/mglib.py +++ b/mglib/mglib.py @@ -6,7 +6,7 @@ import base64 import json import string import time import random import hashlib import subprocess @@ -24,7 +23,7 @@ from .__init__ import API_URL -if not sys.version_info[0:2][0] == 3 and not sys.version_info[0:2] == (2, 7) : +if not sys.version_info[0:2][0] == 3 and not sys.version_info[0:2] == (2, 7): sys.stderr.write('ERROR: MG-RAST Tools requires at least Python 2.7.') exit(1) @@ -65,7 +64,7 @@ def body_from_url(url, accept, auth=None, data=None, debug=False, method=None): except: sys.stderr.write("ERROR (%s): %s\n" %(error.code, error.read().decode("utf8"))) finally: - raise(HTTPerror) + raise HTTPError(error.url, error.code, "HTTP error", error.hdrs, error.fp) if not res: sys.stderr.write("ERROR: no results returned\n") sys.exit(1) @@ -75,18 +74,22 @@ def obj_from_url(url, auth=None, data=None, debug=False, method=None): url = quote(url, safe='/:=?&', encoding="utf-8", errors="strict") if type(data) is str: - data=data.encode("utf8") + data = data.encode("utf8") + if debug: + print("Data", repr(data)) try: result = body_from_url(url, 'application/json', auth=auth, data=data, debug=debug, method=method) read = result.read() except: # try one more time ConnectionResetError is incompatible with python2 result = body_from_url(url, 'application/json', auth=auth, data=data, debug=debug, method=method) read = result.read() + if debug: + print("Result", repr(read)) if result.headers["content-type"] == "application/x-download" or result.headers["content-type"] == "application/octet-stream": return(read) # Watch out! if result.headers["content-type"][0:9] == "text/html": # json decoder won't work return(read) # Watch out! 
- if result.headers["content-type"] == "application/json": # If header is set, this should work + if result.headers["content-type"] == "application/json": # If header is set, this should work data = read.decode("utf8") obj = json.loads(data) else: @@ -139,8 +142,8 @@ def async_rest_api(url, auth=None, data=None, debug=False, delay=60): # If "status" is nor present, or if "status" is somehow not "submitted" # assume this is not an asynchronous call and it's done. if type(submit) == bytes: # can't decode - try: - return decode("utf-8", submit) + try: + return submit.decode("utf-8") except: return submit if ('status' in submit) and (submit['status'] != 'submitted') and (submit['status'] != "processing") and ('data' in submit): @@ -198,11 +201,12 @@ def post_file(url, keyname, filename, data={}, auth=None, debug=False): obj = None # try maxt times - while not success and counter < maxt : + while not success and counter < maxt: try: res = requests.post(url, data=datagen, headers=header, stream=True) except HTTPError as error: try: + sys.stderr.write("Retrying POST %s %s %s\n" % (url, repr(datagen), repr(header))) eobj = json.loads(error.read()) if 'ERROR' in eobj: sys.stderr.write("ERROR (%s): %s\n" %(error.code, eobj['ERROR'])) @@ -213,13 +217,13 @@ finally: # sys.exit(1) return None - except OSError as error: + except OSError as error: sys.stderr.write("ERROR with post_file\n") sys.stderr.write("ERROR (%s): %s\n" %(error.code, error.read())) if not res: sys.stderr.write("ERROR: no results returned for %s\n"% (filename)) # sys.exit(1) - else: + else: obj = json.loads(res.content.decode("utf8")) if debug: print(json.dumps(obj)) @@ -228,7 +232,7 @@ else: success = True # increase counter - if not success : + if not success: counter += 1 time.sleep(counter * sleep) return(obj) @@ -258,9 +262,34 @@ def sparse_to_dense(sMatrix, rmax, cmax): 
dMatrix[r][c] = v return dMatrix +def clean_row(element): + a = ["domain", "phylum", "family" , "class", "order", "genus", "species"] + b = [""] * len(a); name = element["id"] + if element["metadata"] is None: + b[-1] = element["id"] + return(";".join(b)) + else: + if ('ontology' in element['metadata'].keys()): + name = ';'.join(element['metadata']['ontology']) + if ('hierarchy' in element['metadata'].keys()): + if "level1" in element['metadata']["hierarchy"].keys(): + a = ["level1", "level2", "level3" , "level4", "function"] + else: + a = ["domain", "phylum", "family" , "class", "order", "genus", "species"] + b = [""] * len(a) + for k,v in element['metadata']["hierarchy"].items(): + b[a.index(k)] = v + name = ';'.join(b) + return(name) # transform BIOM format to tabbed table # returns max value of matrix def biom_to_tab(biom, hdl, rows=None, use_id=True, col_name=False): + ''' biom + hdl + rows + use_id + col_name ''' assert 'matrix_type' in biom.keys(), repr(biom) if biom['matrix_type'] == 'sparse': matrix = sparse_to_dense(biom['data'], biom['shape'][0], biom['shape'][1]) @@ -273,8 +302,10 @@ rowmax = [] for i, row in enumerate(matrix): name = biom['rows'][i]['id'] - if (not use_id) and ('ontology' in biom['rows'][i]['metadata']): - name += ':'+biom['rows'][i]['metadata']['ontology'][-1] + if use_id: + name = biom['rows'][i]["id"] # Use row[].id + else: + name = clean_row(biom['rows'][i]) if rows and (name not in rows): continue try: @@ -388,7 +419,7 @@ def merge_biom(b1, b2): add_row.append(b2['data'][i][j]) mBiom['rows'].append(r) mBiom['data'].append(add_row) - mBiom['shape'] = [ len(mBiom['rows']), len(mBiom['columns']) ] + mBiom['shape'] = [len(mBiom['rows']), len(mBiom['columns'])] return mBiom # transform BIOM format to matrix in json format @@ -410,9 +441,9 @@ def biom_to_matrix(biom, col_name=False, sig_stats=False): else: data = biom['data'] if sig_stats and ('significance' in biom['rows'][0]['metadata'])
and (len(biom['rows'][0]['metadata']['significance']) > 0): - cols.extend([s[0] for s in biom['rows'][0]['metadata']['significance']] ) + cols.extend([s[0] for s in biom['rows'][0]['metadata']['significance']]) for i, r in enumerate(biom['rows']): - data[i].extend([s[1] for s in r['metadata']['significance']] ) + data[i].extend([s[1] for s in r['metadata']['significance']]) return rows, cols, data # transform tabbed table to matrix in json format @@ -435,7 +466,7 @@ def sub_matrix(matrix, ncols): return matrix sub = list() for row in matrix: - sub.append(row[:ncols] ) + sub.append(row[:ncols]) return sub # return KBase id for MG-RAST id @@ -467,7 +498,7 @@ def kbids_to_mgids(kbids): # or reverse def kbid_lookup(ids, reverse=False): request = 'mg2kb' if reverse else 'kb2mg' - post = json.dumps({'ids': ids}, separators=(',',':')) + post = json.dumps({'ids': ids}, separators=(',', ':')) data = obj_from_url(API_URL+'/job/'+request, auth=auth, data=post) return data['data'] @@ -478,7 +509,7 @@ def get_auth_token(opts=None): return os.environ['MGRKEY'] if hasattr(opts, "token") and opts.token is not None: return opts.token - elif hasattr(opts, 'user') and hasattr(opts, 'passwd') and (opts.user or opts.passwd): + if hasattr(opts, 'user') and hasattr(opts, 'passwd') and (opts.user or opts.passwd): if opts.user and opts.passwd: return token_from_login(opts.user, opts.passwd) else: @@ -494,7 +525,7 @@ def get_auth(token): if not os.path.isfile(auth_file): sys.stderr.write("ERROR: missing authentication file, please login\n") return None - auth_obj = json.load(open(auth_file,'r')) + auth_obj = json.load(open(auth_file, 'r')) if ("token" not in auth_obj) or ("id" not in auth_obj) or ("expiration" not in auth_obj): sys.stderr.write("ERROR: invalid authentication file, please login\n") return None @@ -510,7 +541,7 @@ def token_from_login(user, passwd): def login(token): auth_obj = obj_from_url(API_URL+"/user/authenticate", auth=token) - json.dump(auth_obj, open(auth_file,'w')) + 
json.dump(auth_obj, open(auth_file, 'w')) def login_from_token(token): parts = {} diff --git a/scripts/mg-compare-functions.py b/scripts/mg-compare-functions.py index c7bb653..0e5e162 100755 --- a/scripts/mg-compare-functions.py +++ b/scripts/mg-compare-functions.py @@ -57,9 +57,10 @@ def main(args): parser.add_argument("--evalue", type=int, dest="evalue", default=15, help="negative exponent value for maximum e-value cutoff, default is 15") parser.add_argument("--identity", type=int, dest="identity", default=60, help="percent value for minimum %% identity cutoff, default is 60") parser.add_argument("--length", type=int, dest="length", default=15, help="value for minimum alignment length cutoff, default is 15") + parser.add_argument("--hierarchy", action="store_true", dest="hierarchy", help="Don't use id, show hierarchy") parser.add_argument("--version", type=int, dest="version", default=1, help="M5NR annotation version to use, default is 1") parser.add_argument("--temp", dest="temp", default=None, help="filename to temporarly save biom output at each iteration") - + # get inputs opts = parser.parse_args() if not opts.ids: @@ -74,16 +75,16 @@ def main(args): if opts.format not in ['text', 'biom']: sys.stderr.write("ERROR: invalid input format\n") return 1 - + # get auth token = get_auth_token(opts) - + # build url id_list = [] if os.path.isfile(opts.ids): - id_str = open(opts.ids,'r').read() + id_str = open(opts.ids, 'r').read() try: - id_obj = json.loads(id_str) + id_obj = json.loads(id_str) if 'elements' in id_obj: id_list = id_obj['elements'].keys() elif 'members' in id_obj: @@ -92,14 +93,14 @@ def main(args): id_list = id_str.strip().split('\n') else: id_list = opts.ids.strip().split(',') - params = [ ('group_level', opts.level), - ('source', opts.source), - ('evalue', opts.evalue), - ('identity', opts.identity), - ('length', opts.length), - ('version', opts.version), - ('result_type', 'abundance'), - ('asynchronous', '1') ] + params = [('group_level', 
opts.level), + ('source', opts.source), + ('evalue', opts.evalue), + ('identity', opts.identity), + ('length', opts.length), + ('version', opts.version), + ('result_type', 'abundance'), + ('asynchronous', '1') ] if opts.intersect_level and opts.intersect_name: params.append(('filter_source', opts.intersect_source)) params.append(('filter_level', opts.intersect_level)) @@ -110,12 +111,12 @@ def main(args): else: for f in opts.intersect_name.strip().split(','): params.append(('filter', f)) - + # retrieve data biom = None size = 50 if len(id_list) > size: - for i in xrange(0, len(id_list), size): + for i in range(0, len(id_list), size): sub_ids = id_list[i:i+size] cur_params = copy.deepcopy(params) for i in sub_ids: @@ -132,8 +133,7 @@ def main(args): biom = async_rest_api(url, auth=token) if opts.temp: json.dump(biom, open(opts.temp, 'w')) - - + # get sub annotations sub_ann = set() if opts.filter_name and opts.filter_level: @@ -147,30 +147,30 @@ def main(args): for f in opts.filter_name.strip().split(','): filter_list.append(f) # annotation mapping from m5nr - params = [ ('version', opts.version), - ('min_level', opts.level), - ('source', opts.source) ] + params = [('version', opts.version), + ('min_level', opts.level), + ('source', opts.source) ] url = opts.url+'/m5nr/ontology?'+urlencode(params, True) data = obj_from_url(url) level = 'level4' if opts.level == 'function' else opts.level for ann in data['data']: if (opts.filter_level in ann) and (level in ann) and (ann[opts.filter_level] in filter_list): sub_ann.add(ann[level]) - + # output data if (not opts.output) or (opts.output == '-'): out_hdl = sys.stdout else: out_hdl = open(opts.output, 'w') - + if opts.format == 'biom': out_hdl.write(json.dumps(biom)+"\n") else: - biom_to_tab(biom["data"], out_hdl, rows=sub_ann) - + biom_to_tab(biom["data"], out_hdl, rows=sub_ann, use_id=not opts.hierarchy) + out_hdl.close() return 0 - + if __name__ == "__main__": sys.exit(main(sys.argv)) diff --git 
a/scripts/mg-compare-taxa.py b/scripts/mg-compare-taxa.py index ab21696..ac9c05e 100755 --- a/scripts/mg-compare-taxa.py +++ b/scripts/mg-compare-taxa.py @@ -58,9 +58,10 @@ def main(args): parser.add_argument("--evalue", type=int, dest="evalue", default=15, help="negative exponent value for maximum e-value cutoff, default is 15") parser.add_argument("--identity", type=int, dest="identity", default=60, help="percent value for minimum %% identity cutoff, default is 60") parser.add_argument("--length", type=int, dest="length", default=15, help="value for minimum alignment length cutoff, default is 15") + parser.add_argument("--hierarchy", action="store_true", dest="hierarchy", help="Don't use id, show hierarchy") parser.add_argument("--version", type=int, dest="version", default=1, help="M5NR annotation version to use, default is 1") parser.add_argument("--temp", dest="temp", default=None, help="filename to temporarly save biom output at each iteration") - + # get inputs opts = parser.parse_args() if not opts.ids: @@ -75,16 +76,16 @@ def main(args): if opts.format not in ['text', 'biom']: sys.stderr.write("ERROR: invalid input format\n") return 1 - + # get auth token = get_auth_token(opts) - + # build url id_list = [] if os.path.isfile(opts.ids): - id_str = open(opts.ids,'r').read() + id_str = open(opts.ids, 'r').read() try: - id_obj = json.loads(id_str) + id_obj = json.loads(id_str) if 'elements' in id_obj: id_list = id_obj['elements'].keys() elif 'members' in id_obj: @@ -93,15 +94,15 @@ def main(args): id_list = id_str.strip().split('\n') else: id_list = opts.ids.strip().split(',') - params = [ ('group_level', opts.level), - ('source', opts.source), - ('hit_type', opts.hit_type), - ('evalue', opts.evalue), - ('identity', opts.identity), - ('length', opts.length), - ('version', opts.version), - ('result_type', 'abundance'), - ('asynchronous', '1') ] + params = [('group_level', opts.level), + ('source', opts.source), + ('hit_type', opts.hit_type), + ('evalue', 
opts.evalue), + ('identity', opts.identity), + ('length', opts.length), + ('version', opts.version), + ('result_type', 'abundance'), + ('asynchronous', '1')] if opts.intersect_level and opts.intersect_name: params.append(('filter_source', opts.intersect_source)) params.append(('filter_level', opts.intersect_level)) @@ -117,12 +118,12 @@ def main(args): biom = None size = 50 if len(id_list) > size: - for i in xrange(0, len(id_list), size): + for i in range(0, len(id_list), size): sub_ids = id_list[i:i+size] cur_params = copy.deepcopy(params) for i in sub_ids: cur_params.append(('id', i)) - cur_url = opts.url+'/matrix/organism?'+urlencode(cur_params, True) + cur_url = opts.url+'/matrix/organism?'+urlencode(cur_params, True) cur_biom = async_rest_api(cur_url, auth=token) biom = merge_biom(biom, cur_biom) if opts.temp: @@ -131,10 +132,10 @@ def main(args): for i in id_list: params.append(('id', i)) url = opts.url+'/matrix/organism?'+urlencode(params, True) - biom = async_rest_api(url, auth=token) + biom = async_rest_api(url, auth=token)["data"] if opts.temp: json.dump(biom, open(opts.temp, 'w')) - + # get sub annotations sub_ann = set() if opts.filter_name and opts.filter_level: @@ -148,28 +149,28 @@ def main(args): for f in opts.filter_name.strip().split(','): filter_list.append(f) # annotation mapping from m5nr - params = [ ('version', opts.version), - ('min_level', opts.level) ] + params = [('version', opts.version), + ('min_level', opts.level)] url = opts.url+'/m5nr/taxonomy?'+urlencode(params, True) data = obj_from_url(url) for ann in data['data']: if (opts.filter_level in ann) and (opts.level in ann) and (ann[opts.filter_level] in filter_list): sub_ann.add(ann[opts.level]) - + # output data if (not opts.output) or (opts.output == '-'): out_hdl = sys.stdout else: out_hdl = open(opts.output, 'w') - + if opts.format == 'biom': out_hdl.write(json.dumps(biom)+"\n") else: - biom_to_tab(biom, out_hdl, rows=sub_ann) - + biom_to_tab(biom, out_hdl, rows=sub_ann, use_id=not 
opts.hierarchy) + out_hdl.close() return 0 - + if __name__ == "__main__": sys.exit(main(sys.argv)) diff --git a/scripts/mg-query.py b/scripts/mg-query.py index 868bae0..7c4deef 100755 --- a/scripts/mg-query.py +++ b/scripts/mg-query.py @@ -6,7 +6,7 @@ import sys from argparse import ArgumentParser import json -from mglib import async_rest_api, get_auth_token +from mglib import async_rest_api, get_auth_token, obj_from_url DEBUG = 0 @@ -14,6 +14,7 @@ usage = "usage: %prog [options] URI" parser = ArgumentParser(usage) parser.add_argument("-v", "--verbose", dest="verbose", action="store_true") + parser.add_argument("-1", "--one", dest="onlyonequery", action="store_true") parser.add_argument("-k", "--token", dest="token", type=str, help="Auth token") parser.add_argument("URI", type=str, help="URI to query") @@ -29,7 +30,10 @@ print(URI, file=sys.stderr) # retrieve the data by sending at HTTP GET request to the MG-RAST API - jsonstructure = async_rest_api(URI, auth=key) + if not opts.onlyonequery: + jsonstructure = async_rest_api(URI, auth=key) + else: + jsonstructure = obj_from_url(URI, auth=key) # unpack and display the data table if type(jsonstructure) == dict: # If we have data, not json structure