diff --git a/openwpm_utils/analysis.py b/openwpm_utils/analysis.py
index 684a804..2a2f9f5 100644
--- a/openwpm_utils/analysis.py
+++ b/openwpm_utils/analysis.py
@@ -83,6 +83,74 @@ def get_script_urls_from_call_stack_as_set(call_stack):
     return script_urls
 
 
+def get_ordered_script_urls_from_call_stack(call_stack):
+    """Return the script URLs in the call stack as a ", "-joined string.
+
+    The URLs keep the order in which the scripts appear in the call
+    stack. An empty or missing call stack yields an empty string.
+    """
+    return ", ".join(get_script_urls_from_call_stack_as_list(call_stack))
+
+
+def get_script_urls_from_call_stack_as_list(call_stack):
+    """Return the script URLs in the call stack as an ordered list.
+
+    Consecutive frames from the same script are collapsed into one entry.
+    """
+    script_urls = []
+    if not call_stack:
+        return script_urls
+    last_script_url = ""
+    for stack_frame in call_stack.strip().split("\n"):
+        # A frame looks like "func@https://example.com/a.js:1:2;null".
+        script_url = (
+            stack_frame.rsplit(":", 2)[0].split("@")[-1].split(" line")[0])
+        if script_url != last_script_url:
+            script_urls.append(script_url)
+            last_script_url = script_url
+    return script_urls
+
+
+def get_set_of_script_ps1s_from_call_stack(script_urls):
+    """Return the unique PS+1s of a ", "-joined string of script URLs.
+
+    The PS+1s are sorted so that the returned string is deterministic.
+    """
+    if not script_urls:
+        return ""
+    return ", ".join(
+        sorted({get_ps_plus_1(x) or "" for x in script_urls.split(", ")}))
+
+
+def _collapse_script_urls_to_ps1s(script_urls):
+    """Map an iterable of URLs to PS+1s, dropping consecutive repeats."""
+    script_ps1s = []
+    last_ps1 = None
+    for script_url in script_urls:
+        ps1 = get_ps_plus_1(script_url) or ""
+        if ps1 != last_ps1:
+            script_ps1s.append(ps1)
+            last_ps1 = ps1
+    return script_ps1s
+
+
+def get_ordered_script_ps1s_from_call_stack(call_stack):
+    """Return the script PS+1s in the call stack as a ", "-joined string.
+
+    The PS+1s keep the order in which the scripts appear in the call
+    stack; consecutive frames with the same PS+1 are collapsed.
+    """
+    # Work on the URL list directly: round-tripping through the joined
+    # string would mis-split any URL that itself contains ", ".
+    return ", ".join(_collapse_script_urls_to_ps1s(
+        get_script_urls_from_call_stack_as_list(call_stack)))
+
+
+def get_ordered_script_ps1s_from_stack_script_urls(script_urls):
+    """Return ordered script PS+1s given a ", "-joined string of URLs."""
+    return ", ".join(_collapse_script_urls_to_ps1s(script_urls.split(", ")))
+
+
 def add_col_bare_script_url(js_df):
     """Add a col for script URL without scheme, www and query."""
     js_df['bare_script_url'] =\
@@ -167,14 +235,6 @@ def get_requests_from_visits(con, visit_ids):
     return read_sql_query(qry, con)
 
 
-def get_set_of_script_ps1s_from_call_stack(script_urls):
-    if len(script_urls):
-        return ", ".join(
-            set((get_ps_plus_1(x) or "") for x in script_urls.split(", ")))
-    else:
-        return ""
-
-
 def add_col_set_of_script_ps1s_from_call_stack(js_df):
     js_df['stack_script_ps1s'] =\
         js_df['stack_scripts'].map(get_set_of_script_ps1s_from_call_stack)
diff --git a/setup.py b/setup.py
index c5de21f..34dfd31 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
     name='openwpm-utils',
     license='MPL 2.0',
     url='https://github.com/mozilla/openwpm-utils',
-    version='0.1.2',
+    version='0.1.3',
     packages=['openwpm_utils'],
 
     # Dependencies
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
new file mode 100644
index 0000000..aa19f8b
--- /dev/null
+++ b/tests/test_analysis.py
@@ -0,0 +1,81 @@
+from openwpm_utils.analysis import (
+    get_ordered_script_urls_from_call_stack,
+    get_ordered_script_ps1s_from_call_stack
+)
+
+HTTPS_SCHEME = "https://"
+
+STACK_JS_DOMAIN_1 = 'example-1.com'
+STACK_JS_DOMAIN_2 = 'example-2.com'
+STACK_JS_DOMAIN_3 = 'example-3.com'
+
+STACK_JS_URL_1 = HTTPS_SCHEME + STACK_JS_DOMAIN_1
+STACK_JS_URL_2 = HTTPS_SCHEME + STACK_JS_DOMAIN_2
+STACK_JS_URL_3 = HTTPS_SCHEME + STACK_JS_DOMAIN_3
+
+SAMPLE_STACK_TRACE_1 =\
+    "func@" + STACK_JS_URL_1 + ":1:2;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"\
+    "func@" + STACK_JS_URL_3 + ":5:6;null"
+
+SAMPLE_STACK_TRACE_2 =\
+    "func@" + STACK_JS_URL_1 + ":1:2;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"\
+    "func@" + STACK_JS_URL_3 + ":5:6;null"
+
+
+SAMPLE_STACK_TRACE_3 =\
+    "func@" + STACK_JS_URL_1 + ":1:2;null\n"\
+    "func@" + STACK_JS_URL_1 + ":1:2;null\n"\
+    "func@" + STACK_JS_URL_3 + ":5:6;null\n"\
+    "func@" + STACK_JS_URL_1 + ":1:2;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"\
+    "func@" + STACK_JS_URL_2 + ":3:4;null\n"
+
+
+EXPECTED_STACK_JS_URLS = ", ".join(
+    [STACK_JS_URL_1, STACK_JS_URL_2, STACK_JS_URL_3])
+
+EXPECTED_STACK_JS_PS1S = ", ".join(
+    [STACK_JS_DOMAIN_1, STACK_JS_DOMAIN_2, STACK_JS_DOMAIN_3])
+
+EXPECTED_STACK_JS_URLS_MIXED = ", ".join(
+    [STACK_JS_URL_1, STACK_JS_URL_3,
+     STACK_JS_URL_1, STACK_JS_URL_2])
+
+EXPECTED_STACK_JS_PS1S_MIXED = ", ".join(
+    [STACK_JS_DOMAIN_1, STACK_JS_DOMAIN_3,
+     STACK_JS_DOMAIN_1, STACK_JS_DOMAIN_2])
+
+
+def test_get_ordered_script_urls_from_call_stack():
+    assert get_ordered_script_urls_from_call_stack(
+        SAMPLE_STACK_TRACE_1) == EXPECTED_STACK_JS_URLS
+
+    assert get_ordered_script_urls_from_call_stack(
+        SAMPLE_STACK_TRACE_2) == EXPECTED_STACK_JS_URLS
+
+    assert get_ordered_script_urls_from_call_stack(
+        SAMPLE_STACK_TRACE_3) == EXPECTED_STACK_JS_URLS_MIXED
+
+
+def test_get_ordered_script_ps1s_from_call_stack():
+    assert get_ordered_script_ps1s_from_call_stack(
+        SAMPLE_STACK_TRACE_1) == EXPECTED_STACK_JS_PS1S
+
+    assert get_ordered_script_ps1s_from_call_stack(
+        SAMPLE_STACK_TRACE_2) == EXPECTED_STACK_JS_PS1S
+
+    assert get_ordered_script_ps1s_from_call_stack(
+        SAMPLE_STACK_TRACE_3) == EXPECTED_STACK_JS_PS1S_MIXED
+
+
+def test_empty_call_stack():
+    for empty_call_stack in ("", None):
+        assert get_ordered_script_urls_from_call_stack(
+            empty_call_stack) == ""
+        assert get_ordered_script_ps1s_from_call_stack(
+            empty_call_stack) == ""
diff --git a/tests/test_domain.py b/tests/test_domain.py
index b9d78ff..658a2a7 100644
--- a/tests/test_domain.py
+++ b/tests/test_domain.py
@@ -1,5 +1,5 @@
 import pytest
-from crawl_utils.domain import (
+from openwpm_utils.domain import (
     get_ps_plus_1,
 )
 