From d51adff52d86d12862231bbd5dc98449080b7f31 Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Thu, 3 Nov 2022 14:24:08 -0500 Subject: [PATCH 01/10] pull files from the asathe-gretatu/pereira2018 branch to carry forward #37 --- .../benchmarks/pereira2018_v2022/__init__.py | 165 +++++++ .../pereira2018_v2022/ceiling_packaging.py | 405 ++++++++++++++++++ .../benchmarks/pereira2018_v2022/test.py | 114 +++++ .../data/pereira2018_v2022/__init__.py | 38 ++ .../data/pereira2018_v2022/data_packaging.py | 120 ++++++ .../data/pereira2018_v2022/test.py | 44 ++ 6 files changed, 886 insertions(+) create mode 100644 brainscore_language/benchmarks/pereira2018_v2022/__init__.py create mode 100644 brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py create mode 100644 brainscore_language/benchmarks/pereira2018_v2022/test.py create mode 100644 brainscore_language/data/pereira2018_v2022/__init__.py create mode 100644 brainscore_language/data/pereira2018_v2022/data_packaging.py create mode 100644 brainscore_language/data/pereira2018_v2022/test.py diff --git a/brainscore_language/benchmarks/pereira2018_v2022/__init__.py b/brainscore_language/benchmarks/pereira2018_v2022/__init__.py new file mode 100644 index 00000000..db1aa34a --- /dev/null +++ b/brainscore_language/benchmarks/pereira2018_v2022/__init__.py @@ -0,0 +1,165 @@ +import logging + +from brainio.assemblies import NeuroidAssembly +from brainscore_core.benchmarks import BenchmarkBase +from brainscore_core.metrics import Score +from brainscore_language import load_dataset, load_metric, benchmark_registry +from brainscore_language.artificial_subject import ArtificialSubject +from brainscore_language.utils.ceiling import ceiling_normalize +from brainscore_language.utils.s3 import load_from_s3 + +logger = logging.getLogger(__name__) + +BIBTEX = """@article{pereira2018toward, + title={Toward a universal decoder of linguistic meaning from brain activation}, + author={Pereira, Francisco and Lou, Bin and Pritchett, Brianna and Ritter, Samuel and Gershman, Samuel J + and Kanwisher, Nancy and Botvinick, Matthew and Fedorenko, Evelina}, + journal={Nature communications}, + volume={9}, + number={1}, + pages={1--13}, + year={2018}, + publisher={Nature Publishing Group} +}""" + + +def Pereira2018_243sentences(): + return _Pereira2018LinregPearsonr( + experiment="PereiraE3_72pass", + ceiling_s3_kwargs=dict( + sha1="64ebaded266506df698ed249685ffadabf017951", + version_id="H8Apri0513nJ8Qy54dp3IitpuLOL9UyK", + raw_kwargs=dict( + version_id="ws6mzNrk9AkN2rFP3vVzBcSSgF3n0J5O", + sha1="a72adbc813e1eaff5fbb3e4fcb9c1b202d3cdbf2", + raw_kwargs=dict( + version_id="LYWHrqobBWLYYKMwMz4bEUsB2s..9QTY", + sha1="0b5477ead370939a65b824629f72fad40da0543f", + ), + ), + ), + ) + + +def Pereira2018_384sentences(): + return _Pereira2018LinregPearsonr( + experiment="PereiraE2_96pass", + ceiling_s3_kwargs=dict( + sha1="638b7a0d7a08a851ace19af9489801c9c1a4c05b", + version_id="OFu.5imbaIbrJvD6xny_G55XfEzOU37o", + raw_kwargs=dict( + sha1="3348a8c489f5d091c5f4ded9d50f0c895e2368f4", + version_id="c6zAAvSRxerQ3kHRf0I7WnSoFZDSd3kG", + raw_kwargs=dict( + sha1="75a88348f30135bb0453459ba8eddf33eabf49ff", + version_id="cA3D7oKIcpx.qZz9tLOFO_kAkOduht4W", + ), + ), + ), + ) + + +class _Pereira2018LinregPearsonr(BenchmarkBase): + """ + Evaluate model ability to predict neural activity in the human language system in response to natural sentences, + recorded by Pereira et al. 2018. 
+ Alignment of neural activity between model and human subjects is evaluated via cross-validated linear predictivity. + + This benchmark builds off the behavioral benchmark introduced + in Schrimpf et al. 2021 (https://www.pnas.org/doi/10.1073/pnas.2105646118), but: + + * computes neural alignment to each of the two experiments ({243,384}sentences) separately, as well as ceilings + * requires the model to have committed to neural readouts (e.g. layer 41 corresponds to the language system), + rather than testing every layer separately + + Each of these benchmarks evaluates one of the two experiments, the overall Pereira2018-linear score is the mean of + the two ceiling-normalized scores. + """ + + def __init__( + self, + experiment: str, + ceiling_s3_kwargs: dict, + ): + self.data = self._load_data(experiment) + self.metric = load_metric( + "linreg_pearsonr", + crossvalidation_kwargs={ + "split_coord": "sentence", + }, + regression_kwargs={ + "stimulus_coord": "sentence", + }, + correlation_kwargs={ + "correlation_coord": "sentence", + "neuroid_coord": "neuroid", + }, + ) + identifier = f"Pereira2018_v2022.{experiment}-linreg_pearsonr" + ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) + super(_Pereira2018LinregPearsonr, self).__init__( + identifier=identifier, + ceiling_func=ceiling, + version=1, + parent="Pereira2018-linear", + bibtex=BIBTEX, + ) + + def _load_data(self, experiment: str) -> NeuroidAssembly: + data = load_dataset("Pereira2018_v2022.language") + # data = data.sel(experiment=experiment) # filter experiment + data.loc[data.experiment == experiment, :, :] + data = data.dropna( + "neuroid" + ) # not all subjects have done both experiments, drop those that haven't + data.attrs["identifier"] = f"{data.identifier}.{experiment}" + if "time" in data.dims: + data = data.drop("time").squeeze("time") + return data + + def _load_ceiling( + self, + identifier: str, + version_id: str, + sha1: str, + assembly_prefix="ceiling_", + raw_kwargs=None, + ): + ceiling = load_from_s3( + identifier, + cls=Score, + assembly_prefix=assembly_prefix, + version_id=version_id, + sha1=sha1, + ) + if raw_kwargs: # recursively load raw attributes + raw = self._load_ceiling( + identifier=identifier, + assembly_prefix=assembly_prefix + "raw_", + **raw_kwargs, + ) + ceiling.attrs["raw"] = raw + return ceiling + + def __call__(self, candidate: ArtificialSubject) -> Score: + candidate.perform_neural_recording( + recording_target=ArtificialSubject.RecordingTarget.language_system, + recording_type=ArtificialSubject.RecordingType.fMRI, + ) + stimuli = self.data["stimuli"] + predictions = candidate.digest_text(stimuli.values)["neural"] + predictions["presentation"] = "presentation", stimuli["sentence"].values + raw_score = self.metric( + predictions, + self.data, + ) + score = raw_score # ceiling_normalize(raw_score, self.ceiling) + return score + + +benchmark_registry[ + "Pereira2018_v2022.243sentences-linreg_pearsonr" +] = Pereira2018_243sentences +benchmark_registry[ + "Pereira2018_v2022.384sentences-linreg_pearsonr" +] = Pereira2018_384sentences diff --git a/brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py b/brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py new file mode 100644 index 00000000..27db0421 --- /dev/null +++ b/brainscore_language/benchmarks/pereira2018_v2022/ceiling_packaging.py @@ -0,0 +1,405 @@ +import logging +import sys +import warnings +from collections import defaultdict + +import numpy as np +from numpy import AxisError +from 
numpy.random import RandomState +from result_caching import store +from scipy.optimize import curve_fit +from tqdm import tqdm, trange + +from brainio.assemblies import ( + DataAssembly, + array_is_element, + walk_coords, + merge_data_arrays, +) +from brainscore_core.metrics import Score +from brainscore_language import load_benchmark +from brainscore_language.utils import fullname +from brainscore_language.utils.s3 import upload_data_assembly +from brainscore_language.utils.transformations import apply_aggregate + +_logger = logging.getLogger(__name__) + +""" +Because ceiling estimations for Pereira2018-linear benchmarks are very costly to run (many hours), we pre-compute them, +and store the resulting ceiling (including per-neuroid estimates) on S3. +The code in this file was run only once, and is kept here for reference. +""" + + +def upload_ceiling(experiment): + benchmark = load_benchmark(f"Pereira2018_v2022.{experiment}-linreg_pearsonr") + ceiler = ExtrapolationCeiling() + ceiling = ceiler(benchmark.data, metric=benchmark.metric) + _logger.info(f"Uploading ceiling {ceiling}") + # Note that because we cannot serialize complex objects to netCDF, attributes like 'raw' and 'bootstrapped_params' + # will get lost during upload + upload_data_assembly( + ceiling, assembly_identifier=benchmark.identifier, assembly_prefix="ceiling_" + ) + # also upload raw (and raw.raw) attributes + upload_data_assembly( + ceiling.raw, + assembly_identifier=benchmark.identifier, + assembly_prefix="ceiling_raw_", + ) + upload_data_assembly( + ceiling.raw.raw, + assembly_identifier=benchmark.identifier, + assembly_prefix="ceiling_raw_raw_", + ) + + +def v(x, v0, tau0): # function for ceiling extrapolation + return v0 * (1 - np.exp(-x / tau0)) + + +class HoldoutSubjectCeiling: + def __init__(self, subject_column): + self.subject_column = subject_column + self._logger = logging.getLogger(fullname(self)) + self._rng = RandomState(0) + self._num_bootstraps = 5 + + def __call__(self, assembly, metric): + subjects = set(assembly[self.subject_column].values) + scores = [] + iterate_subjects = self._rng.choice( + list(subjects), size=self._num_bootstraps + ) # use only a subset of subjects + for subject in tqdm(iterate_subjects, desc="heldout subject"): + try: + subject_assembly = assembly[ + { + "neuroid": [ + subject_value == subject + for subject_value in assembly[self.subject_column].values + ] + } + ] + # run subject pool as neural candidate + subject_pool = subjects - {subject} + pool_assembly = assembly[ + { + "neuroid": [ + subject in subject_pool + for subject in assembly[self.subject_column].values + ] + } + ] + score = metric(pool_assembly, subject_assembly) + # store scores + apply_raw = "raw" in score.attrs and not hasattr( + score.raw, self.subject_column + ) # only propagate if column not part of score + score = score.expand_dims(self.subject_column, _apply_raw=apply_raw) + score.__setitem__(self.subject_column, [subject], _apply_raw=apply_raw) + scores.append(score) + except NoOverlapException as e: + self._logger.debug(f"Ignoring no overlap {e}") + continue # ignore + except ValueError as e: + if "Found array with" in str(e): + self._logger.debug(f"Ignoring empty array {e}") + continue + else: + raise e + + scores = Score.merge(*scores) + scores = apply_aggregate( + lambda scores: scores.mean(self.subject_column), scores + ) + return scores + + +class ExtrapolationCeiling: + def __init__( + self, + subject_column="subject", + extrapolation_dimension="neuroid", + num_bootstraps=100, + ): + self._logger 
= logging.getLogger(fullname(self)) + self.subject_column = subject_column + self.extrapolation_dimension = extrapolation_dimension + self.num_bootstraps = num_bootstraps + self.num_subsamples = 10 + self.holdout_ceiling = HoldoutSubjectCeiling(subject_column=subject_column) + self._rng = RandomState(0) + + def __call__(self, assembly, metric): + scores = self.collect( + identifier=assembly.identifier, assembly=assembly, metric=metric + ) + return self.extrapolate(identifier=assembly.identifier, ceilings=scores) + + @store(identifier_ignore=["assembly", "metric"]) + def collect(self, identifier, assembly, metric): + self._logger.debug("Collecting data for extrapolation") + subjects = set(assembly[self.subject_column].values) + subject_subsamples = tuple(range(2, len(subjects) + 1)) + scores = [] + for num_subjects in tqdm(subject_subsamples, desc="num subjects"): + for sub_subjects in self._random_combinations( + subjects=set(assembly[self.subject_column].values), + num_subjects=num_subjects, + choice=self.num_subsamples, + rng=self._rng, + ): + sub_assembly = assembly[ + { + "neuroid": [ + subject in sub_subjects + for subject in assembly[self.subject_column].values + ] + } + ] + selections = {self.subject_column: sub_subjects} + try: + score = self.holdout_ceiling(assembly=sub_assembly, metric=metric) + score = score.expand_dims("num_subjects") + score["num_subjects"] = [num_subjects] + for key, selection in selections.items(): + expand_dim = f"sub_{key}" + score = score.expand_dims(expand_dim) + score[expand_dim] = [str(selection)] + scores.append(score.raw) + except KeyError as e: # nothing to merge + if str(e) == "'z'": + self._logger.debug(f"Ignoring merge error {e}") + continue + else: + raise e + scores = Score.merge(*scores) + return scores + + def _random_combinations(self, subjects, num_subjects, choice, rng): + # following https://stackoverflow.com/a/55929159/2225200. Also see similar method in `behavioral.py`. 
+ subjects = np.array(list(subjects)) + combinations = set() + while len(combinations) < choice: + elements = rng.choice(subjects, size=num_subjects, replace=False) + combinations.add(tuple(elements)) + return combinations + + def extrapolate(self, identifier, ceilings): + neuroid_ceilings = [] + raw_keys = ["bootstrapped_params", "error_low", "error_high", "endpoint_x"] + raw_attrs = defaultdict(list) + for i in trange( + len(ceilings[self.extrapolation_dimension]), + desc=f"{self.extrapolation_dimension} extrapolations", + ): + try: + # extrapolate per-neuroid ceiling + neuroid_ceiling = ceilings.isel(**{self.extrapolation_dimension: [i]}) + extrapolated_ceiling = self.extrapolate_neuroid( + neuroid_ceiling.squeeze() + ) + extrapolated_ceiling = self.add_neuroid_meta( + extrapolated_ceiling, neuroid_ceiling + ) + neuroid_ceilings.append(extrapolated_ceiling) + # keep track of raw attributes + for key in raw_keys: + values = extrapolated_ceiling.attrs[key] + values = self.add_neuroid_meta(values, neuroid_ceiling) + raw_attrs[key].append(values) + except AxisError: # no extrapolation successful (happens for 1 neuroid in Pereira) + _logger.warning( + f"Failed to extrapolate neuroid ceiling {i}", exc_info=True + ) + continue + + # merge and add meta + self._logger.debug("Merging neuroid ceilings") + neuroid_ceilings = manual_merge( + *neuroid_ceilings, on=self.extrapolation_dimension + ) + neuroid_ceilings.attrs["raw"] = ceilings + + for key, values in raw_attrs.items(): + self._logger.debug(f"Merging {key}") + values = manual_merge(*values, on=self.extrapolation_dimension) + neuroid_ceilings.attrs[key] = values + # aggregate + ceiling = self.aggregate_neuroid_ceilings(neuroid_ceilings, raw_keys=raw_keys) + ceiling.attrs["identifier"] = identifier + return ceiling + + def add_neuroid_meta(self, target, source): + target = target.expand_dims(self.extrapolation_dimension) + for coord, dims, values in walk_coords(source): + if array_is_element(dims, self.extrapolation_dimension): + target[coord] = dims, values + return target + + def aggregate_neuroid_ceilings(self, neuroid_ceilings, raw_keys): + ceiling = neuroid_ceilings.median(self.extrapolation_dimension) + ceiling.attrs["raw"] = neuroid_ceilings + for key in raw_keys: + values = neuroid_ceilings.attrs[key] + aggregate = values.median(self.extrapolation_dimension) + if not aggregate.shape: # scalar value, e.g. for error_low + aggregate = aggregate.item() + ceiling.attrs[key] = aggregate + return ceiling + + def extrapolate_neuroid(self, ceilings): + # figure out how many extrapolation x points we have. E.g. 
for Pereira, not all combinations are possible + subject_subsamples = list(sorted(set(ceilings["num_subjects"].values))) + rng = RandomState(0) + bootstrap_params = [] + for bootstrap in range(self.num_bootstraps): + bootstrapped_scores = [] + for num_subjects in subject_subsamples: + num_scores = ceilings.sel(num_subjects=num_subjects) + # the sub_subjects dimension creates nans, get rid of those + num_scores = num_scores.dropna(f"sub_{self.subject_column}") + assert set(num_scores.dims) == { + f"sub_{self.subject_column}", + "split", + } or set(num_scores.dims) == {f"sub_{self.subject_column}"} + # choose from subject subsets and the splits therein, with replacement for variance + choices = num_scores.values.flatten() + bootstrapped_score = rng.choice( + choices, size=len(choices), replace=True + ) + bootstrapped_scores.append(np.mean(bootstrapped_score)) + + try: + params = self.fit(subject_subsamples, bootstrapped_scores) + except RuntimeError: # optimal parameters not found + params = [np.nan, np.nan] + params = DataAssembly( + [params], + coords={"bootstrap": [bootstrap], "param": ["v0", "tau0"]}, + dims=["bootstrap", "param"], + ) + bootstrap_params.append(params) + bootstrap_params = merge_data_arrays(bootstrap_params) + # find endpoint and error + asymptote_threshold = 0.0005 + interpolation_xs = np.arange(1000) + ys = np.array( + [ + v(interpolation_xs, *params) + for params in bootstrap_params.values + if not np.isnan(params).any() + ] + ) + median_ys = np.median(ys, axis=0) + diffs = np.diff(median_ys) + end_x = np.where(diffs < asymptote_threshold)[ + 0 + ].min() # first x where increase smaller than threshold + # put together + center = np.median(np.array(bootstrap_params)[:, 0]) + error_low, error_high = ci_error(ys[:, end_x], center=center) + score = Score(center) + score.attrs["raw"] = ceilings + score.attrs["error_low"] = DataAssembly(error_low) + score.attrs["error_high"] = DataAssembly(error_high) + score.attrs["bootstrapped_params"] = bootstrap_params + score.attrs["endpoint_x"] = DataAssembly(end_x) + return score + + def fit(self, subject_subsamples, bootstrapped_scores): + valid = ~np.isnan(bootstrapped_scores) + if sum(valid) < 1: + raise RuntimeError("No valid scores in sample") + params, pcov = curve_fit( + v, + subject_subsamples, + bootstrapped_scores, + # v (i.e. 
max ceiling) is between 0 and 1, tau0 unconstrained + bounds=([0, -np.inf], [1, np.inf]), + ) + return params + + +def manual_merge(*elements, on="neuroid"): + dims = elements[0].dims + assert all(element.dims == dims for element in elements[1:]) + merge_index = dims.index(on) + # the coordinates in the merge index should have the same keys + assert _coords_match( + elements, dim=on, match_values=False + ), f"coords in {[element[on] for element in elements]} do not match" + # all other dimensions, their coordinates and values should already align + for dim in set(dims) - {on}: + assert _coords_match( + elements, dim=dim, match_values=True + ), f"coords in {[element[dim] for element in elements]} do not match" + # merge values without meta + merged_values = np.concatenate( + [element.values for element in elements], axis=merge_index + ) + # piece together with meta + result = type(elements[0])( + merged_values, + coords={ + **{ + coord: (dims, values) + for coord, dims, values in walk_coords(elements[0]) + if not array_is_element(dims, on) + }, + **{ + coord: ( + dims, + np.concatenate([element[coord].values for element in elements]), + ) + for coord, dims, _ in walk_coords(elements[0]) + if array_is_element(dims, on) + }, + }, + dims=elements[0].dims, + ) + return result + + +def _coords_match(elements, dim, match_values=False): + first_coords = [ + (key, tuple(value)) if match_values else key + for _, key, value in walk_coords(elements[0][dim]) + ] + other_coords = [ + [ + (key, tuple(value)) if match_values else key + for _, key, value in walk_coords(element[dim]) + ] + for element in elements[1:] + ] + return all(tuple(first_coords) == tuple(coords) for coords in other_coords) + + +def ci_error(samples, center, confidence=0.95): + low, high = 100 * ((1 - confidence) / 2), 100 * (1 - ((1 - confidence) / 2)) + confidence_below, confidence_above = np.nanpercentile( + samples, low + ), np.nanpercentile(samples, high) + confidence_below, confidence_above = ( + center - confidence_below, + confidence_above - center, + ) + return confidence_below, confidence_above + + +class NoOverlapException(Exception): + pass + + +if __name__ == "__main__": + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + for shush_logger in ["botocore", "boto3", "s3transfer", "urllib3"]: + logging.getLogger(shush_logger).setLevel(logging.INFO) + warnings.filterwarnings( + "ignore" + ) # suppress RuntimeWarning from extrapolation overflows + + upload_ceiling("243sentences") + upload_ceiling("384sentences") diff --git a/brainscore_language/benchmarks/pereira2018_v2022/test.py b/brainscore_language/benchmarks/pereira2018_v2022/test.py new file mode 100644 index 00000000..56f6bb98 --- /dev/null +++ b/brainscore_language/benchmarks/pereira2018_v2022/test.py @@ -0,0 +1,114 @@ +import copy + +import numpy as np +import pytest +from numpy.random import RandomState +from pytest import approx + +from brainio.assemblies import NeuroidAssembly +from brainscore_language import load_dataset, ArtificialSubject, load_benchmark + + +class TestBenchmark: + class DummyModel(ArtificialSubject): + def __init__(self, neural_activity): + self.neural_activity = neural_activity + + def digest_text(self, stimuli): + assert len(stimuli) == len(self.neural_activity["presentation"]) + self.neural_activity["sentence"] = "presentation", stimuli # copy over + return {"neural": self.neural_activity} + + def perform_neural_recording( + self, + recording_target: ArtificialSubject.RecordingTarget, + recording_type: 
ArtificialSubject.RecordingType, + ): + assert recording_target == ArtificialSubject.RecordingTarget.language_system + assert recording_type == ArtificialSubject.RecordingType.fMRI + + @pytest.mark.parametrize( + "experiment, expected_score", + [ + (243, approx(0.0017534 / 0.35378928, abs=0.001)), + (384, approx(0.01215216 / 0.36343748, abs=0.001)), + ], + ) + def test_dummy_bad(self, experiment, expected_score): + benchmark = load_benchmark( + f"Pereira2018_v2022.{experiment}sentences-linreg_pearsonr" + ) + neural_activity = RandomState(0).random( + size=(experiment, 25) + ) # presentation x neuroid + neural_activity = NeuroidAssembly( + neural_activity, + coords={ + "stimulus_seq": ("presentation", np.arange(experiment)), + "stimulus_num": ("presentation", np.arange(experiment)), + "neuroid_id": ("neuroid", np.arange(25)), + "region": ("neuroid", ["some_region"] * 25), + }, + dims=["presentation", "neuroid"], + ) + dummy_model = TestBenchmark.DummyModel(neural_activity=neural_activity) + score = benchmark(dummy_model) + assert score == expected_score + + @pytest.mark.parametrize( + "experiment", + [ + 243, + 384, + ], + ) + def test_exact(self, experiment): + benchmark = load_benchmark( + f"Pereira2018_v2022.{experiment}sentences-linreg_pearsonr" + ) + + # TODO: we are lazy-dropping benchark.data timeid dim, but drop in packaging or accommodate donwstream later? + benchmark.data = benchmark.data.squeeze(dim="time", drop=True) + + exact_data = copy.deepcopy(benchmark.data) # .reset_index("presentation") + # exact_data = exact_data.drop_vars("sentence") + del (exact_data["presentation"],) + # exact_data = exact_data.set_index( + # presentation=list(exact_data["presentation"].coords) + # ) + dummy_model = TestBenchmark.DummyModel(neural_activity=exact_data) + score = benchmark(dummy_model) + print(score) + assert score == approx(1) + + # @pytest.mark.parametrize( + # "experiment, expected_ceiling", + # [ + # (243, 0.35378928), + # (384, 0.36343748), + # ], + # ) + # def test_ceiling(self, experiment, expected_ceiling): + # benchmark = load_benchmark(f"Pereira2018.{experiment}sentences-linear") + # ceiling = benchmark.ceiling + # assert ceiling == approx(expected_ceiling, abs=0.0005) + + # @pytest.mark.parametrize("experiment", [243, 384]) + # def test_ceiling_raw(self, experiment): + # benchmark = load_benchmark(f"Pereira2018.{experiment}sentences-linear") + # ceiling = benchmark.ceiling + # assert hasattr(ceiling, "raw") + # assert set(ceiling.raw.dims) == {"neuroid"} + # assert ceiling.raw.median() == ceiling + # assert hasattr(ceiling.raw, "raw") + # assert set(ceiling.raw.raw.dims) == { + # "sub_subject", + # "num_subjects", + # "split", + # "neuroid", + # } + + +if __name__ == "__main__": + test = TestBenchmark() + test.test_exact(243) diff --git a/brainscore_language/data/pereira2018_v2022/__init__.py b/brainscore_language/data/pereira2018_v2022/__init__.py new file mode 100644 index 00000000..a514988b --- /dev/null +++ b/brainscore_language/data/pereira2018_v2022/__init__.py @@ -0,0 +1,38 @@ +import logging + +from brainscore_language import data_registry + +from brainscore_language.utils.s3 import load_from_s3 +from brainscore_language.utils.local import load_from_disk + +_logger = logging.getLogger(__name__) + +BIBTEX = """@article{pereira2018toward, + title={Toward a universal decoder of linguistic meaning from brain activation}, + author={Pereira, Francisco and Lou, Bin and Pritchett, Brianna and Ritter, Samuel and Gershman, Samuel J + and Kanwisher, Nancy and Botvinick, Matthew 
and Fedorenko, Evelina}, + journal={Nature communications}, + volume={9}, + number={1}, + pages={1--13}, + year={2018}, + publisher={Nature Publishing Group} +}""" + + +try: + data_registry["Pereira2018_v2022.language"] = lambda: load_from_s3( + identifier="Pereira2018ROI", + # version_id="8kEzAXaWnxjMIC95MrhSJ3gtP2a1W.Y7", + version_id="i1XqPs72b82kfzj9kZ9EQwVZjLltmlSz", + # sha1="235ecf1f2abc38cb892c0a7b5c9c465434f153c5", + sha1="d1dc23f2157cfcf6f78abf781a92480cd919ad1c", + assembly_prefix="assembly_", + ) +except Exception as e: + # TODO: check what class of Exception is thrown by S3 object not found, and catch that + # while we don't know, we will continue to throw the exception rather than handling it + raise e + data_registry["Pereira2018_v2022.language"] = lambda: load_from_disk( + "Pereira2018ROI" + ) diff --git a/brainscore_language/data/pereira2018_v2022/data_packaging.py b/brainscore_language/data/pereira2018_v2022/data_packaging.py new file mode 100644 index 00000000..5ee049fb --- /dev/null +++ b/brainscore_language/data/pereira2018_v2022/data_packaging.py @@ -0,0 +1,120 @@ +import logging +import re +import sys +from pathlib import Path + +import numpy as np +import pandas as pd + +from brainio import fetch +from brainio.assemblies import NeuroidAssembly +from brainio.packaging import upload_to_s3, write_netcdf +from brainscore_core.benchmarks import BenchmarkBase +from brainscore_core.metrics import Score +from brainscore_language import benchmark_registry, load_dataset, load_metric +from brainscore_language.artificial_subject import ArtificialSubject +from brainscore_language.utils.ceiling import ceiling_normalize +from brainscore_language.utils.s3 import load_from_s3 +from tqdm import tqdm + +_logger = logging.getLogger(__name__) + +""" +The code in this file is intended to be run only once to initially upload the data, and is kept here +for reference. +""" + + +def package_Pereira2018ROI(): + """ + Load Pereira2018ROI benchmark csv file and package it into an xarray benchmark. + """ + data_path = Path(__file__).parent + data_file = ( + data_path / "Pereira2018_Lang_fROI.csv" + ) # contains both neural data and associated stimuli. 
+ _logger.info(f"Data file: {data_file}.") + + # get data + data = pd.read_csv(data_file) + data["Neuroid"] = data.UID.astype(str) + "_" + data.ROI + + neuroid_id_map = dict(zip(set(data.Neuroid), np.arange(len(set(data.Neuroid))))) + presentation_id_map = dict( + zip(set(data["Sentence"]), np.arange(len(set(data["Sentence"])))), + ) + + presentation_arr = [None] * len(presentation_id_map) + experiment_arr = [None] * len(presentation_id_map) + neuroid_arr = [None] * len(neuroid_id_map) + subject_arr = [None] * len(neuroid_id_map) + + # inv_neuroid_map = {v: k for k, v in neuroid_id_map} + # inv_presentation_map = {v: k for k, v in presentation_id_map} + + effect_sizes = np.array(data["EffectSize"]) + effect_sizes_placeholder = np.array( + [np.nan for _ in neuroid_id_map for _ in presentation_id_map] + ).reshape(len(presentation_id_map), len(neuroid_id_map)) + + for ix, row in tqdm( + data.iterrows(), total=len(data), desc=f"iterating through {data_file}" + ): + neuroid = row["Neuroid"] + presentation = row["Sentence"] + neuroid_id = neuroid_id_map[neuroid] + presentation_id = presentation_id_map[presentation] + + effect_sizes_placeholder[presentation_id, neuroid_id] = row["EffectSize"] + + presentation_arr[presentation_id] = presentation + experiment_arr[presentation_id] = row["Experiment"] + subject_arr[neuroid_id] = row["UID"] + neuroid_arr[neuroid_id] = neuroid + + assembly = NeuroidAssembly( + effect_sizes_placeholder.reshape(*effect_sizes_placeholder.shape, 1), + dims=("presentation", "neuroid", "time"), # ? added time + coords={ + "stimuli": ("presentation", presentation_arr), + "experiment": ("presentation", experiment_arr), + "subject": ("neuroid", subject_arr), + "roi": ("neuroid", neuroid_arr), + "time": ("time", [0]), + }, + ) + + # upload + upload_data_assembly( + assembly, + assembly_identifier="Pereira2018ROI", + bucket_name="brainscore-language", + ) + + +def upload_data_assembly(assembly, assembly_identifier, bucket_name): + # adapted from + # https://github.com/mschrimpf/brainio/blob/8a40a3558d0b86072b9e221808f19005c7cb8c17/brainio/packaging.py#L217 + + _logger.debug(f"Uploading {assembly_identifier} to S3") + + # identifiers + assembly_store_identifier = "assembly_" + assembly_identifier.replace(".", "_") + netcdf_file_name = assembly_store_identifier + ".nc" + target_netcdf_path = ( + Path(fetch.get_local_data_path()) / assembly_store_identifier / netcdf_file_name + ) + s3_key = netcdf_file_name + + # write to disk and upload + netcdf_kf_sha1 = write_netcdf(assembly, target_netcdf_path) + _logger.debug(f"Wrote file to {target_netcdf_path}.") + upload_to_s3(target_netcdf_path, bucket_name, s3_key) + _logger.debug( + f"Uploaded assembly {assembly_identifier} to S3: {s3_key} (SHA1 hash {netcdf_kf_sha1})" + ) + + +if __name__ == "__main__": + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + package_Pereira2018ROI() diff --git a/brainscore_language/data/pereira2018_v2022/test.py b/brainscore_language/data/pereira2018_v2022/test.py new file mode 100644 index 00000000..74dd30db --- /dev/null +++ b/brainscore_language/data/pereira2018_v2022/test.py @@ -0,0 +1,44 @@ +import copy +import logging + +import numpy as np +import pytest +from numpy.random import RandomState +from pytest import approx + +from brainio.assemblies import NeuroidAssembly +from brainscore_language import load_dataset, ArtificialSubject, load_benchmark + +_logger = logging.getLogger(__name__) + + +class TestData: + def test_language(self): + assembly = load_dataset("Pereira2018_v2022.language") + 
assert set(assembly["experiment"].values) == { + "PereiraE2_96pass", + "PereiraE3_72pass", + } + _logger.info("experiment names match up!") + + assert len(assembly["presentation"]) == 243 + 384 + _logger.info(f'no. of presentation IDs == {len(assembly["presentation"])}') + + assert len(set(assembly["sentence"].values)) == 243 + 384 + _logger.info(f'no. of stimuli == {len(assembly["sentence"].values)}') + + assert ( + "The concert pianist went blind in adulthood." + in assembly["sentence"].values + ) + _logger.info( + f'stimulus "The concert pianist went blind in adulthood." is present!' + ) + + assert len(set(assembly["subject"].values)) == 10 + _logger.info(f'no. of subjects == {len(set(assembly["subject"].values))}') + + assert ( + len(set(assembly["neuroid"].values)) == 120 + ) # potential TODO: rename to neuroid_id + _logger.info(f'no. of neuroids == {len(set(assembly["neuroid"].values))}') From c97f71e164f041ee300acaf406f973ea53e197f0 Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Thu, 3 Nov 2022 17:13:28 -0500 Subject: [PATCH 02/10] once more fix kwarg `ceiling_func` --- .../benchmarks/futrell2018/benchmark.py | 63 +++++---- .../benchmarks/pereira2018/benchmark.py | 123 ++++++++++++------ 2 files changed, 120 insertions(+), 66 deletions(-) diff --git a/brainscore_language/benchmarks/futrell2018/benchmark.py b/brainscore_language/benchmarks/futrell2018/benchmark.py index 6deec995..924f9574 100644 --- a/brainscore_language/benchmarks/futrell2018/benchmark.py +++ b/brainscore_language/benchmarks/futrell2018/benchmark.py @@ -25,27 +25,30 @@ class Futrell2018Pearsonr(BenchmarkBase): """ def __init__(self): - self.data = load_dataset('Futrell2018') - self.metric = load_metric('pearsonr') - ceiler = SplitHalvesConsistency(num_splits=10, split_coordinate='subject_id', consistency_metric=self.metric) + self.data = load_dataset("Futrell2018") + self.metric = load_metric("pearsonr") + ceiler = SplitHalvesConsistency( + num_splits=10, split_coordinate="subject_id", consistency_metric=self.metric + ) ceiling = ceiler(self.data) super(Futrell2018Pearsonr, self).__init__( - identifier='Futrell2018-pearsonr', + identifier="Futrell2018-pearsonr", version=1, - parent='behavior', - ceiling=ceiling, - bibtex=self.data.bibtex) + parent="behavior", + ceiling_func=ceiling, + bibtex=self.data.bibtex, + ) def __call__(self, candidate: ArtificialSubject) -> Score: # run experiment candidate.start_behavioral_task(ArtificialSubject.Task.reading_times) - stimuli = self.data['word'].values - predictions = candidate.digest_text(stimuli)['behavior'] - attach_presentation_meta(predictions, self.data['presentation']) + stimuli = self.data["word"].values + predictions = candidate.digest_text(stimuli)["behavior"] + attach_presentation_meta(predictions, self.data["presentation"]) # exclude first words - predictions = predictions[predictions['word_within_sentence_id'] != 1] - targets = self.data[self.data['word_within_sentence_id'] != 1] - targets = targets.mean('subject') # compare to "average human" + predictions = predictions[predictions["word_within_sentence_id"] != 1] + targets = self.data[self.data["word_within_sentence_id"] != 1] + targets = targets.mean("subject") # compare to "average human" # score raw_score = self.metric(predictions, targets) score = ceiling_normalize(raw_score, self.ceiling) @@ -56,7 +59,9 @@ class SplitHalvesConsistency: # following # https://github.com/brain-score/brain-score/blob/c51b8aa2c94212a9ac56c06c556afad0bb0a3521/brainscore/metrics/ceiling.py#L25-L96 - def __init__(self, 
num_splits: int, split_coordinate: str, consistency_metric: Metric): + def __init__( + self, num_splits: int, split_coordinate: str, consistency_metric: Metric + ): """ :param num_splits: how many times to create two halves :param split_coordinate: over which coordinate to split the assembly into halves @@ -73,18 +78,30 @@ def __call__(self, assembly: DataAssembly) -> Score: consistencies, uncorrected_consistencies = [], [] splits = range(self.num_splits) for _ in splits: - half1_values = random_state.choice(split_values, size=len(split_values) // 2, replace=False) - half2_values = set(split_values) - set(half1_values) # this only works because of `replace=False` above - half1 = assembly[{split_dim: [value in half1_values for value in split_values]}].mean(split_dim) - half2 = assembly[{split_dim: [value in half2_values for value in split_values]}].mean(split_dim) + half1_values = random_state.choice( + split_values, size=len(split_values) // 2, replace=False + ) + half2_values = set(split_values) - set( + half1_values + ) # this only works because of `replace=False` above + half1 = assembly[ + {split_dim: [value in half1_values for value in split_values]} + ].mean(split_dim) + half2 = assembly[ + {split_dim: [value in half2_values for value in split_values]} + ].mean(split_dim) consistency = self.consistency_metric(half1, half2) uncorrected_consistencies.append(consistency) # Spearman-Brown correction for sub-sampling corrected_consistency = 2 * consistency / (1 + (2 - 1) * consistency) consistencies.append(corrected_consistency) - consistencies = Score(consistencies, coords={'split': splits}, dims=['split']) - uncorrected_consistencies = Score(uncorrected_consistencies, coords={'split': splits}, dims=['split']) - average_consistency = consistencies.median('split') - average_consistency.attrs['raw'] = consistencies - average_consistency.attrs['uncorrected_consistencies'] = uncorrected_consistencies + consistencies = Score(consistencies, coords={"split": splits}, dims=["split"]) + uncorrected_consistencies = Score( + uncorrected_consistencies, coords={"split": splits}, dims=["split"] + ) + average_consistency = consistencies.median("split") + average_consistency.attrs["raw"] = consistencies + average_consistency.attrs[ + "uncorrected_consistencies" + ] = uncorrected_consistencies return average_consistency diff --git a/brainscore_language/benchmarks/pereira2018/benchmark.py b/brainscore_language/benchmarks/pereira2018/benchmark.py index 925f3880..caaeda5e 100644 --- a/brainscore_language/benchmarks/pereira2018/benchmark.py +++ b/brainscore_language/benchmarks/pereira2018/benchmark.py @@ -25,33 +25,39 @@ def Pereira2018_243sentences(): - return _Pereira2018ExperimentLinear(experiment='243sentences', ceiling_s3_kwargs=dict( - version_id='CHl_9aFHIWVnPW_njePfy28yzggKuUPw', - sha1='5e23de899883828f9c886aec304bc5aa0f58f66c', - raw_kwargs=dict( - version_id='uZye03ENmn.vKB5mARUGhcIY_DjShtPD', - sha1='525a6ac8c14ad826c63fdd71faeefb8ba542d5ac', + return _Pereira2018ExperimentLinear( + experiment="243sentences", + ceiling_s3_kwargs=dict( + version_id="CHl_9aFHIWVnPW_njePfy28yzggKuUPw", + sha1="5e23de899883828f9c886aec304bc5aa0f58f66c", raw_kwargs=dict( - version_id='XVTo58Po5YrNjTuDIWrmfHI0nbN2MVZa', - sha1='34ba453dc7e8a19aed18cc9bca160e97b4a80be5' - ) - ) - )) + version_id="uZye03ENmn.vKB5mARUGhcIY_DjShtPD", + sha1="525a6ac8c14ad826c63fdd71faeefb8ba542d5ac", + raw_kwargs=dict( + version_id="XVTo58Po5YrNjTuDIWrmfHI0nbN2MVZa", + sha1="34ba453dc7e8a19aed18cc9bca160e97b4a80be5", + ), + ), + ), + ) def 
Pereira2018_384sentences(): - return _Pereira2018ExperimentLinear(experiment='384sentences', ceiling_s3_kwargs=dict( - version_id='sjlnXr5wXUoGv6exoWu06C4kYI0KpZLk', - sha1='fc895adc52fd79cea3040961d65d8f736a9d3e29', - raw_kwargs=dict( - version_id='Hi74r9UKfpK0h0Bjf5DL.JgflGoaknrA', - sha1='ce2044a7713426870a44131a99bfc63d8843dae0', + return _Pereira2018ExperimentLinear( + experiment="384sentences", + ceiling_s3_kwargs=dict( + version_id="sjlnXr5wXUoGv6exoWu06C4kYI0KpZLk", + sha1="fc895adc52fd79cea3040961d65d8f736a9d3e29", raw_kwargs=dict( - version_id='m4dq_ouKWZkYtdyNPMSP0p6rqb7wcYpi', - sha1='fe9fb24b34fd5602e18e34006ac5ccc7d4c825b8' - ) - ) - )) + version_id="Hi74r9UKfpK0h0Bjf5DL.JgflGoaknrA", + sha1="ce2044a7713426870a44131a99bfc63d8843dae0", + raw_kwargs=dict( + version_id="m4dq_ouKWZkYtdyNPMSP0p6rqb7wcYpi", + sha1="fe9fb24b34fd5602e18e34006ac5ccc7d4c825b8", + ), + ), + ), + ) class _Pereira2018ExperimentLinear(BenchmarkBase): @@ -73,43 +79,74 @@ class _Pereira2018ExperimentLinear(BenchmarkBase): def __init__(self, experiment: str, ceiling_s3_kwargs: dict): self.data = self._load_data(experiment) - self.metric = load_metric('linear_pearsonr') - identifier = f'Pereira2018.{experiment}-linear' + self.metric = load_metric("linear_pearsonr") + identifier = f"Pereira2018.{experiment}-linear" ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) super(_Pereira2018ExperimentLinear, self).__init__( identifier=identifier, version=1, - parent='Pereira2018-linear', - ceiling=ceiling, - bibtex=BIBTEX) + parent="Pereira2018-linear", + ceiling_func=ceiling, + bibtex=BIBTEX, + ) def _load_data(self, experiment: str) -> NeuroidAssembly: - data = load_dataset('Pereira2018.language') + data = load_dataset("Pereira2018.language") data = data.sel(experiment=experiment) # filter experiment - data = data.dropna('neuroid') # not all subjects have done both experiments, drop those that haven't - data.attrs['identifier'] = f"{data.identifier}.{experiment}" + data = data.dropna( + "neuroid" + ) # not all subjects have done both experiments, drop those that haven't + data.attrs["identifier"] = f"{data.identifier}.{experiment}" return data - def _load_ceiling(self, identifier: str, version_id: str, sha1: str, assembly_prefix="ceiling_", raw_kwargs=None): - ceiling = load_from_s3(identifier, cls=Score, assembly_prefix=assembly_prefix, version_id=version_id, sha1=sha1) + def _load_ceiling( + self, + identifier: str, + version_id: str, + sha1: str, + assembly_prefix="ceiling_", + raw_kwargs=None, + ): + ceiling = load_from_s3( + identifier, + cls=Score, + assembly_prefix=assembly_prefix, + version_id=version_id, + sha1=sha1, + ) if raw_kwargs: # recursively load raw attributes - raw = self._load_ceiling(identifier=identifier, assembly_prefix=assembly_prefix + "raw_", **raw_kwargs) - ceiling.attrs['raw'] = raw + raw = self._load_ceiling( + identifier=identifier, + assembly_prefix=assembly_prefix + "raw_", + **raw_kwargs, + ) + ceiling.attrs["raw"] = raw return ceiling def __call__(self, candidate: ArtificialSubject) -> Score: - candidate.start_neural_recording(recording_target=ArtificialSubject.RecordingTarget.language_system, - recording_type=ArtificialSubject.RecordingType.fMRI) - stimuli = self.data['stimulus'] - passages = self.data['passage_label'].values + candidate.start_neural_recording( + recording_target=ArtificialSubject.RecordingTarget.language_system, + recording_type=ArtificialSubject.RecordingType.fMRI, + ) + stimuli = self.data["stimulus"] + passages = 
self.data["passage_label"].values predictions = [] - for passage in sorted(set(passages)): # go over individual passages, sorting to keep consistency across runs - passage_indexer = [stimulus_passage == passage for stimulus_passage in passages] + for passage in sorted( + set(passages) + ): # go over individual passages, sorting to keep consistency across runs + passage_indexer = [ + stimulus_passage == passage for stimulus_passage in passages + ] passage_stimuli = stimuli[passage_indexer] - passage_predictions = candidate.digest_text(passage_stimuli.values)['neural'] - passage_predictions['stimulus_id'] = 'presentation', passage_stimuli['stimulus_id'].values + passage_predictions = candidate.digest_text(passage_stimuli.values)[ + "neural" + ] + passage_predictions["stimulus_id"] = ( + "presentation", + passage_stimuli["stimulus_id"].values, + ) predictions.append(passage_predictions) - predictions = xr.concat(predictions, dim='presentation') + predictions = xr.concat(predictions, dim="presentation") raw_score = self.metric(predictions, self.data) score = ceiling_normalize(raw_score, self.ceiling) return score From 7cfd655370037635f78090db06a0cf8f099bfcf9 Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Fri, 4 Nov 2022 09:27:44 -0500 Subject: [PATCH 03/10] migrate packaging part to `benchmark.py` file; format file so import_plugin() can actually parse it (ref. https://github.com/brain-score/language/commit/4f712a237eabcab77a4f24f685a374e6221603e0#r88882805) --- .../benchmarks/pereira2018_v2022/__init__.py | 168 +----------------- .../benchmarks/pereira2018_v2022/benchmark.py | 157 ++++++++++++++++ 2 files changed, 161 insertions(+), 164 deletions(-) create mode 100644 brainscore_language/benchmarks/pereira2018_v2022/benchmark.py diff --git a/brainscore_language/benchmarks/pereira2018_v2022/__init__.py b/brainscore_language/benchmarks/pereira2018_v2022/__init__.py index db1aa34a..aa4556f2 100644 --- a/brainscore_language/benchmarks/pereira2018_v2022/__init__.py +++ b/brainscore_language/benchmarks/pereira2018_v2022/__init__.py @@ -1,165 +1,5 @@ -import logging +from .benchmark import Pereira2018_243sentences, Pereira2018_384sentences +from brainscore_language import benchmark_registry -from brainio.assemblies import NeuroidAssembly -from brainscore_core.benchmarks import BenchmarkBase -from brainscore_core.metrics import Score -from brainscore_language import load_dataset, load_metric, benchmark_registry -from brainscore_language.artificial_subject import ArtificialSubject -from brainscore_language.utils.ceiling import ceiling_normalize -from brainscore_language.utils.s3 import load_from_s3 - -logger = logging.getLogger(__name__) - -BIBTEX = """@article{pereira2018toward, - title={Toward a universal decoder of linguistic meaning from brain activation}, - author={Pereira, Francisco and Lou, Bin and Pritchett, Brianna and Ritter, Samuel and Gershman, Samuel J - and Kanwisher, Nancy and Botvinick, Matthew and Fedorenko, Evelina}, - journal={Nature communications}, - volume={9}, - number={1}, - pages={1--13}, - year={2018}, - publisher={Nature Publishing Group} -}""" - - -def Pereira2018_243sentences(): - return _Pereira2018LinregPearsonr( - experiment="PereiraE3_72pass", - ceiling_s3_kwargs=dict( - sha1="64ebaded266506df698ed249685ffadabf017951", - version_id="H8Apri0513nJ8Qy54dp3IitpuLOL9UyK", - raw_kwargs=dict( - version_id="ws6mzNrk9AkN2rFP3vVzBcSSgF3n0J5O", - sha1="a72adbc813e1eaff5fbb3e4fcb9c1b202d3cdbf2", - raw_kwargs=dict( - version_id="LYWHrqobBWLYYKMwMz4bEUsB2s..9QTY", - 
sha1="0b5477ead370939a65b824629f72fad40da0543f", - ), - ), - ), - ) - - -def Pereira2018_384sentences(): - return _Pereira2018LinregPearsonr( - experiment="PereiraE2_96pass", - ceiling_s3_kwargs=dict( - sha1="638b7a0d7a08a851ace19af9489801c9c1a4c05b", - version_id="OFu.5imbaIbrJvD6xny_G55XfEzOU37o", - raw_kwargs=dict( - sha1="3348a8c489f5d091c5f4ded9d50f0c895e2368f4", - version_id="c6zAAvSRxerQ3kHRf0I7WnSoFZDSd3kG", - raw_kwargs=dict( - sha1="75a88348f30135bb0453459ba8eddf33eabf49ff", - version_id="cA3D7oKIcpx.qZz9tLOFO_kAkOduht4W", - ), - ), - ), - ) - - -class _Pereira2018LinregPearsonr(BenchmarkBase): - """ - Evaluate model ability to predict neural activity in the human language system in response to natural sentences, - recorded by Pereira et al. 2018. - Alignment of neural activity between model and human subjects is evaluated via cross-validated linear predictivity. - - This benchmark builds off the behavioral benchmark introduced - in Schrimpf et al. 2021 (https://www.pnas.org/doi/10.1073/pnas.2105646118), but: - - * computes neural alignment to each of the two experiments ({243,384}sentences) separately, as well as ceilings - * requires the model to have committed to neural readouts (e.g. layer 41 corresponds to the language system), - rather than testing every layer separately - - Each of these benchmarks evaluates one of the two experiments, the overall Pereira2018-linear score is the mean of - the two ceiling-normalized scores. - """ - - def __init__( - self, - experiment: str, - ceiling_s3_kwargs: dict, - ): - self.data = self._load_data(experiment) - self.metric = load_metric( - "linreg_pearsonr", - crossvalidation_kwargs={ - "split_coord": "sentence", - }, - regression_kwargs={ - "stimulus_coord": "sentence", - }, - correlation_kwargs={ - "correlation_coord": "sentence", - "neuroid_coord": "neuroid", - }, - ) - identifier = f"Pereira2018_v2022.{experiment}-linreg_pearsonr" - ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) - super(_Pereira2018LinregPearsonr, self).__init__( - identifier=identifier, - ceiling_func=ceiling, - version=1, - parent="Pereira2018-linear", - bibtex=BIBTEX, - ) - - def _load_data(self, experiment: str) -> NeuroidAssembly: - data = load_dataset("Pereira2018_v2022.language") - # data = data.sel(experiment=experiment) # filter experiment - data.loc[data.experiment == experiment, :, :] - data = data.dropna( - "neuroid" - ) # not all subjects have done both experiments, drop those that haven't - data.attrs["identifier"] = f"{data.identifier}.{experiment}" - if "time" in data.dims: - data = data.drop("time").squeeze("time") - return data - - def _load_ceiling( - self, - identifier: str, - version_id: str, - sha1: str, - assembly_prefix="ceiling_", - raw_kwargs=None, - ): - ceiling = load_from_s3( - identifier, - cls=Score, - assembly_prefix=assembly_prefix, - version_id=version_id, - sha1=sha1, - ) - if raw_kwargs: # recursively load raw attributes - raw = self._load_ceiling( - identifier=identifier, - assembly_prefix=assembly_prefix + "raw_", - **raw_kwargs, - ) - ceiling.attrs["raw"] = raw - return ceiling - - def __call__(self, candidate: ArtificialSubject) -> Score: - candidate.perform_neural_recording( - recording_target=ArtificialSubject.RecordingTarget.language_system, - recording_type=ArtificialSubject.RecordingType.fMRI, - ) - stimuli = self.data["stimuli"] - predictions = candidate.digest_text(stimuli.values)["neural"] - predictions["presentation"] = "presentation", stimuli["sentence"].values - raw_score = self.metric( - 
predictions, - self.data, - ) - score = raw_score # ceiling_normalize(raw_score, self.ceiling) - return score - - -benchmark_registry[ - "Pereira2018_v2022.243sentences-linreg_pearsonr" -] = Pereira2018_243sentences -benchmark_registry[ - "Pereira2018_v2022.384sentences-linreg_pearsonr" -] = Pereira2018_384sentences +benchmark_registry["Pereira2018_v2022.243sentences-linreg_pearsonr"] = Pereira2018_243sentences +benchmark_registry["Pereira2018_v2022.384sentences-linreg_pearsonr"] = Pereira2018_384sentences diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py new file mode 100644 index 00000000..0736af0a --- /dev/null +++ b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py @@ -0,0 +1,157 @@ +from brainio.assemblies import NeuroidAssembly +from brainscore_core.benchmarks import BenchmarkBase +from brainscore_core.metrics import Score +from brainscore_language.artificial_subject import ArtificialSubject +from brainscore_language.utils.ceiling import ceiling_normalize +from brainscore_language.utils.s3 import load_from_s3 +from brainscore_language import load_dataset, load_metric + +import logging + +logger = logging.getLogger(__name__) + +BIBTEX = """@article{pereira2018toward, + title={Toward a universal decoder of linguistic meaning from brain activation}, + author={Pereira, Francisco and Lou, Bin and Pritchett, Brianna and Ritter, Samuel and Gershman, Samuel J + and Kanwisher, Nancy and Botvinick, Matthew and Fedorenko, Evelina}, + journal={Nature communications}, + volume={9}, + number={1}, + pages={1--13}, + year={2018}, + publisher={Nature Publishing Group} +}""" + + +def Pereira2018_243sentences(): + return _Pereira2018LinregPearsonr( + experiment="PereiraE3_72pass", + ceiling_s3_kwargs=dict( + sha1="64ebaded266506df698ed249685ffadabf017951", + version_id="H8Apri0513nJ8Qy54dp3IitpuLOL9UyK", + raw_kwargs=dict( + version_id="ws6mzNrk9AkN2rFP3vVzBcSSgF3n0J5O", + sha1="a72adbc813e1eaff5fbb3e4fcb9c1b202d3cdbf2", + raw_kwargs=dict( + version_id="LYWHrqobBWLYYKMwMz4bEUsB2s..9QTY", + sha1="0b5477ead370939a65b824629f72fad40da0543f", + ), + ), + ), + ) + + +def Pereira2018_384sentences(): + return _Pereira2018LinregPearsonr( + experiment="PereiraE2_96pass", + ceiling_s3_kwargs=dict( + sha1="638b7a0d7a08a851ace19af9489801c9c1a4c05b", + version_id="OFu.5imbaIbrJvD6xny_G55XfEzOU37o", + raw_kwargs=dict( + sha1="3348a8c489f5d091c5f4ded9d50f0c895e2368f4", + version_id="c6zAAvSRxerQ3kHRf0I7WnSoFZDSd3kG", + raw_kwargs=dict( + sha1="75a88348f30135bb0453459ba8eddf33eabf49ff", + version_id="cA3D7oKIcpx.qZz9tLOFO_kAkOduht4W", + ), + ), + ), + ) + + +class _Pereira2018LinregPearsonr(BenchmarkBase): + """ + Evaluate model ability to predict neural activity in the human language system in response to natural sentences, + recorded by Pereira et al. 2018. + Alignment of neural activity between model and human subjects is evaluated via cross-validated linear predictivity. + + This benchmark builds off the behavioral benchmark introduced + in Schrimpf et al. 2021 (https://www.pnas.org/doi/10.1073/pnas.2105646118), but: + + * computes neural alignment to each of the two experiments ({243,384}sentences) separately, as well as ceilings + * requires the model to have committed to neural readouts (e.g. 
layer 41 corresponds to the language system), + rather than testing every layer separately + + Each of these benchmarks evaluates one of the two experiments, the overall Pereira2018-linear score is the mean of + the two ceiling-normalized scores. + """ + + def __init__( + self, + experiment: str, + ceiling_s3_kwargs: dict, + ): + self.data = self._load_data(experiment) + self.metric = load_metric( + "linreg_pearsonr", + crossvalidation_kwargs={ + "split_coord": "sentence", + }, + regression_kwargs={ + "stimulus_coord": "sentence", + }, + correlation_kwargs={ + "correlation_coord": "sentence", + "neuroid_coord": "neuroid", + }, + ) + identifier = f"Pereira2018_v2022.{experiment}-linreg_pearsonr" + ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) + super(_Pereira2018LinregPearsonr, self).__init__( + identifier=identifier, + ceiling_func=ceiling, + version=1, + parent="Pereira2018-linear", + bibtex=BIBTEX, + ) + + def _load_data(self, experiment: str) -> NeuroidAssembly: + data = load_dataset("Pereira2018_v2022.language") + # data = data.sel(experiment=experiment) # filter experiment + data.loc[data.experiment == experiment, :, :] + data = data.dropna( + "neuroid" + ) # not all subjects have done both experiments, drop those that haven't + data.attrs["identifier"] = f"{data.identifier}.{experiment}" + if "time" in data.dims: + data = data.drop("time").squeeze("time") + return data + + def _load_ceiling( + self, + identifier: str, + version_id: str, + sha1: str, + assembly_prefix="ceiling_", + raw_kwargs=None, + ): + ceiling = load_from_s3( + identifier, + cls=Score, + assembly_prefix=assembly_prefix, + version_id=version_id, + sha1=sha1, + ) + if raw_kwargs: # recursively load raw attributes + raw = self._load_ceiling( + identifier=identifier, + assembly_prefix=assembly_prefix + "raw_", + **raw_kwargs, + ) + ceiling.attrs["raw"] = raw + return ceiling + + def __call__(self, candidate: ArtificialSubject) -> Score: + candidate.perform_neural_recording( + recording_target=ArtificialSubject.RecordingTarget.language_system, + recording_type=ArtificialSubject.RecordingType.fMRI, + ) + stimuli = self.data["stimuli"] + predictions = candidate.digest_text(stimuli.values)["neural"] + predictions["presentation"] = "presentation", stimuli["sentence"].values + raw_score = self.metric( + predictions, + self.data, + ) + score = raw_score # ceiling_normalize(raw_score, self.ceiling) + return score From ffe034126cf5b60e2be5612a0e3a9d1e53ed9be3 Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Fri, 4 Nov 2022 09:28:09 -0500 Subject: [PATCH 04/10] remove now-defunct `load_from_disk` import --- brainscore_language/data/pereira2018_v2022/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brainscore_language/data/pereira2018_v2022/__init__.py b/brainscore_language/data/pereira2018_v2022/__init__.py index a514988b..d8fdd123 100644 --- a/brainscore_language/data/pereira2018_v2022/__init__.py +++ b/brainscore_language/data/pereira2018_v2022/__init__.py @@ -1,9 +1,9 @@ import logging from brainscore_language import data_registry - from brainscore_language.utils.s3 import load_from_s3 -from brainscore_language.utils.local import load_from_disk + +# from brainscore_language.utils.local import load_from_disk _logger = logging.getLogger(__name__) From 2ac901b37dd796bdf60a0efe3ba9d6a66fd70858 Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Fri, 4 Nov 2022 15:24:03 -0500 Subject: [PATCH 05/10] change `ceiling_func` to `ceiling` --- 
brainscore_language/benchmarks/futrell2018/benchmark.py | 2 +- brainscore_language/benchmarks/pereira2018/benchmark.py | 2 +- brainscore_language/benchmarks/pereira2018_v2022/benchmark.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/brainscore_language/benchmarks/futrell2018/benchmark.py b/brainscore_language/benchmarks/futrell2018/benchmark.py index 924f9574..6c5c0a49 100644 --- a/brainscore_language/benchmarks/futrell2018/benchmark.py +++ b/brainscore_language/benchmarks/futrell2018/benchmark.py @@ -33,9 +33,9 @@ def __init__(self): ceiling = ceiler(self.data) super(Futrell2018Pearsonr, self).__init__( identifier="Futrell2018-pearsonr", + ceiling=ceiling, version=1, parent="behavior", - ceiling_func=ceiling, bibtex=self.data.bibtex, ) diff --git a/brainscore_language/benchmarks/pereira2018/benchmark.py b/brainscore_language/benchmarks/pereira2018/benchmark.py index caaeda5e..9165cf5f 100644 --- a/brainscore_language/benchmarks/pereira2018/benchmark.py +++ b/brainscore_language/benchmarks/pereira2018/benchmark.py @@ -84,9 +84,9 @@ def __init__(self, experiment: str, ceiling_s3_kwargs: dict): ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) super(_Pereira2018ExperimentLinear, self).__init__( identifier=identifier, + ceiling=ceiling, version=1, parent="Pereira2018-linear", - ceiling_func=ceiling, bibtex=BIBTEX, ) diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py index 0736af0a..ba8d574e 100644 --- a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py +++ b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py @@ -99,7 +99,7 @@ def __init__( ceiling = self._load_ceiling(identifier=identifier, **ceiling_s3_kwargs) super(_Pereira2018LinregPearsonr, self).__init__( identifier=identifier, - ceiling_func=ceiling, + ceiling=ceiling, version=1, parent="Pereira2018-linear", bibtex=BIBTEX, From a650927f914076f1e362c2b15487b37a291797a6 Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Sat, 5 Nov 2022 21:52:11 -0400 Subject: [PATCH 06/10] fix metric name --- brainscore_language/benchmarks/pereira2018_v2022/benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py index ba8d574e..21591f3f 100644 --- a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py +++ b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py @@ -83,7 +83,7 @@ def __init__( ): self.data = self._load_data(experiment) self.metric = load_metric( - "linreg_pearsonr", + "linear_pearsonr", crossvalidation_kwargs={ "split_coord": "sentence", }, @@ -101,7 +101,7 @@ def __init__( identifier=identifier, ceiling=ceiling, version=1, - parent="Pereira2018-linear", + parent="Pereira2018-linreg_pearsonr", bibtex=BIBTEX, ) From 1ea240aa7af11f097dcace9db4b3d2b66a92c0fd Mon Sep 17 00:00:00 2001 From: aalok-sathe Date: Wed, 16 Nov 2022 16:47:26 -0500 Subject: [PATCH 07/10] update benchmark to fix proper by-experiment data selection --- .../benchmarks/pereira2018_v2022/benchmark.py | 26 +++++++++---------- .../data/pereira2018_v2022/__init__.py | 23 +++++----------- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py index 21591f3f..55741d7b 100644 --- 
From 1ea240aa7af11f097dcace9db4b3d2b66a92c0fd Mon Sep 17 00:00:00 2001
From: aalok-sathe
Date: Wed, 16 Nov 2022 16:47:26 -0500
Subject: [PATCH 07/10] update benchmark to fix proper by-experiment data selection

---
 .../benchmarks/pereira2018_v2022/benchmark.py | 26 +++++++++----------
 .../data/pereira2018_v2022/__init__.py        | 23 +++++-----------
 2 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
index 21591f3f..55741d7b 100644
--- a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
+++ b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
@@ -27,14 +27,14 @@ def Pereira2018_243sentences():
     return _Pereira2018LinregPearsonr(
         experiment="PereiraE3_72pass",
         ceiling_s3_kwargs=dict(
-            sha1="64ebaded266506df698ed249685ffadabf017951",
-            version_id="H8Apri0513nJ8Qy54dp3IitpuLOL9UyK",
+            sha1="f74a68b74568e44257778effc40c56d223cc5219",
+            version_id="g1oQg1J.HBuG2WwD6fwjNsjhMvsbQpjA",
             raw_kwargs=dict(
-                version_id="ws6mzNrk9AkN2rFP3vVzBcSSgF3n0J5O",
-                sha1="a72adbc813e1eaff5fbb3e4fcb9c1b202d3cdbf2",
+                sha1="e64c0c8e73169340f30f7135d1f2f4f1275cb2ad",
+                version_id="8NMVc498YgYz7TZiocfeueCB5uxBt42G",
                 raw_kwargs=dict(
-                    version_id="LYWHrqobBWLYYKMwMz4bEUsB2s..9QTY",
-                    sha1="0b5477ead370939a65b824629f72fad40da0543f",
+                    sha1="5b97b28406e18ea345a254d15562f4d271c6dc59",
+                    version_id="cNV3rCdVCAu2FjbQyoNGtFZNBfkh0q5d",
                 ),
             ),
         ),
@@ -45,14 +45,14 @@ def Pereira2018_384sentences():
     return _Pereira2018LinregPearsonr(
         experiment="PereiraE2_96pass",
         ceiling_s3_kwargs=dict(
-            sha1="638b7a0d7a08a851ace19af9489801c9c1a4c05b",
-            version_id="OFu.5imbaIbrJvD6xny_G55XfEzOU37o",
+            sha1="d4b55d2d7b44a27d45c375b938584b6b2c9b2265",
+            version_id="U_kebareV3iUmbUwg4P1MVJJdj_ImpcP",
             raw_kwargs=dict(
-                sha1="3348a8c489f5d091c5f4ded9d50f0c895e2368f4",
-                version_id="c6zAAvSRxerQ3kHRf0I7WnSoFZDSd3kG",
+                sha1="e64c0c8e73169340f30f7135d1f2f4f1275cb2ad",
+                version_id="yKvtiG3W5FUrzRvi7nXdk7tH2x4C0H_F",
                 raw_kwargs=dict(
-                    sha1="75a88348f30135bb0453459ba8eddf33eabf49ff",
-                    version_id="cA3D7oKIcpx.qZz9tLOFO_kAkOduht4W",
+                    sha1="5b97b28406e18ea345a254d15562f4d271c6dc59",
+                    version_id="2lHr8sfxTfZ25j88on1Wve2mmPnk3Cfp",
                 ),
             ),
         ),
@@ -108,7 +108,7 @@ def __init__(
     def _load_data(self, experiment: str) -> NeuroidAssembly:
         data = load_dataset("Pereira2018_v2022.language")
         # data = data.sel(experiment=experiment)  # filter experiment
-        data.loc[data.experiment == experiment, :, :]
+        data = data.loc[data.experiment == experiment, :, :]
         data = data.dropna(
             "neuroid"
         )  # not all subjects have done both experiments, drop those that haven't

diff --git a/brainscore_language/data/pereira2018_v2022/__init__.py b/brainscore_language/data/pereira2018_v2022/__init__.py
index d8fdd123..90a34a8f 100644
--- a/brainscore_language/data/pereira2018_v2022/__init__.py
+++ b/brainscore_language/data/pereira2018_v2022/__init__.py
@@ -20,19 +20,10 @@
 }"""
-try:
-    data_registry["Pereira2018_v2022.language"] = lambda: load_from_s3(
-        identifier="Pereira2018ROI",
-        # version_id="8kEzAXaWnxjMIC95MrhSJ3gtP2a1W.Y7",
-        version_id="i1XqPs72b82kfzj9kZ9EQwVZjLltmlSz",
-        # sha1="235ecf1f2abc38cb892c0a7b5c9c465434f153c5",
-        sha1="d1dc23f2157cfcf6f78abf781a92480cd919ad1c",
-        assembly_prefix="assembly_",
-    )
-except Exception as e:
-    # TODO: check what class of Exception is thrown by S3 object not found, and catch that
-    # while we don't know, we will continue to throw the exception rather than handling it
-    raise e
-    data_registry["Pereira2018_v2022.language"] = lambda: load_from_disk(
-        "Pereira2018ROI"
-    )
+
+
+data_registry["Pereira2018_v2022.language"] = lambda: load_from_s3(
+    identifier="Pereira2018ROI",
+    version_id="pwsEbmuuEc60F2z0Y8xVw7i2RgtjyEf9",
+    sha1="63543362c5e8175efb40721016edb9963b4f7e1e",
+    assembly_prefix="assembly_",
+)
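The substantive fix in PATCH 07 is assigning the .loc selection back to data: xarray selections return new objects, so the earlier bare data.loc[...] expression discarded its result and no experiment filtering actually happened. A small self-contained illustration of the difference, using a toy array rather than the real assembly:

    import numpy as np
    import xarray as xr

    data = xr.DataArray(
        np.random.rand(4, 3),
        dims=("presentation", "neuroid"),
        coords={"experiment": ("presentation", ["E2", "E2", "E3", "E3"])},
    )

    data.loc[data.experiment == "E3", :]         # result discarded: data is unchanged
    data = data.loc[data.experiment == "E3", :]  # keeps only the E3 presentations
    assert data.sizes["presentation"] == 2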
From 0e77b6fac0537eb9bb162050b96939059e60f77c Mon Sep 17 00:00:00 2001
From: aalok-sathe
Date: Thu, 17 Nov 2022 09:31:28 -0500
Subject: [PATCH 08/10] change occurrences of 'sentence' with 'stimuli' in the test data script

---
 brainscore_language/data/pereira2018_v2022/test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/brainscore_language/data/pereira2018_v2022/test.py b/brainscore_language/data/pereira2018_v2022/test.py
index 74dd30db..f94c192e 100644
--- a/brainscore_language/data/pereira2018_v2022/test.py
+++ b/brainscore_language/data/pereira2018_v2022/test.py
@@ -24,12 +24,12 @@ def test_language(self):
         assert len(assembly["presentation"]) == 243 + 384
         _logger.info(f'no. of presentation IDs == {len(assembly["presentation"])}')
 
-        assert len(set(assembly["sentence"].values)) == 243 + 384
-        _logger.info(f'no. of stimuli == {len(assembly["sentence"].values)}')
+        assert len(set(assembly["stimuli"].values)) == 243 + 384
+        _logger.info(f'no. of stimuli == {len(assembly["stimuli"].values)}')
         assert (
             "The concert pianist went blind in adulthood."
-            in assembly["sentence"].values
+            in assembly["stimuli"].values
         )
         _logger.info(
             f'stimulus "The concert pianist went blind in adulthood." is present!'
         )

From 855c58c93a8cb5186cf1276bba28fdfc47962d3d Mon Sep 17 00:00:00 2001
From: aalok-sathe
Date: Thu, 17 Nov 2022 11:20:02 -0500
Subject: [PATCH 09/10] still more incorrectly named dimension in benchmark?

---
 .../benchmarks/pereira2018_v2022/benchmark.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
index 55741d7b..59cef365 100644
--- a/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
+++ b/brainscore_language/benchmarks/pereira2018_v2022/benchmark.py
@@ -85,13 +85,13 @@ def __init__(
         self.metric = load_metric(
             "linear_pearsonr",
             crossvalidation_kwargs={
-                "split_coord": "sentence",
+                "split_coord": "stimuli",
             },
             regression_kwargs={
-                "stimulus_coord": "sentence",
+                "stimulus_coord": "stimuli",
             },
             correlation_kwargs={
-                "correlation_coord": "sentence",
+                "correlation_coord": "stimuli",
                 "neuroid_coord": "neuroid",
             },
         )
@@ -148,7 +148,7 @@ def __call__(self, candidate: ArtificialSubject) -> Score:
         )
         stimuli = self.data["stimuli"]
         predictions = candidate.digest_text(stimuli.values)["neural"]
-        predictions["presentation"] = "presentation", stimuli["sentence"].values
+        predictions["presentation"] = "presentation", stimuli["stimuli"].values
         raw_score = self.metric(
             predictions,
             self.data,
         )

From 104e80061329aa835bc745f1052543ee6ad6c3ee Mon Sep 17 00:00:00 2001
From: aalok-sathe
Date: Mon, 28 Nov 2022 13:41:29 -0500
Subject: [PATCH 10/10] change coordinate name in test_dummy_bad

---
 brainscore_language/benchmarks/pereira2018_v2022/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/brainscore_language/benchmarks/pereira2018_v2022/test.py b/brainscore_language/benchmarks/pereira2018_v2022/test.py
index 56f6bb98..8047a222 100644
--- a/brainscore_language/benchmarks/pereira2018_v2022/test.py
+++ b/brainscore_language/benchmarks/pereira2018_v2022/test.py
@@ -44,8 +44,8 @@ def test_dummy_bad(self, experiment, expected_score):
         neural_activity = NeuroidAssembly(
             neural_activity,
             coords={
-                "stimulus_seq": ("presentation", np.arange(experiment)),
-                "stimulus_num": ("presentation", np.arange(experiment)),
+                "stimuli": ("presentation", np.arange(experiment)),
+                # "stimuli_id": ("presentation", np.arange(experiment)),
                 "neuroid_id": ("neuroid", np.arange(25)),
                 "region": ("neuroid", ["some_region"] * 25),
             },
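PATCHES 08 through 10 settle on a single coordinate name: the packaged assembly carries the sentence text in a "stimuli" coordinate on the "presentation" dimension, the metric splits, regresses, and correlates over that coordinate, and the dummy test assembly is built with the same name. Below is a minimal sketch of an assembly shaped the way the benchmark now expects, with toy values and the NeuroidAssembly constructor used in the same way as in the tests above; the exact coordinate set of the packaged data may be richer than shown here.

    import numpy as np
    from brainio.assemblies import NeuroidAssembly

    num_stimuli, num_neuroids = 5, 25
    assembly = NeuroidAssembly(
        np.random.rand(num_stimuli, num_neuroids),
        coords={
            "stimuli": ("presentation", [f"sentence {i}" for i in range(num_stimuli)]),
            "neuroid_id": ("neuroid", np.arange(num_neuroids)),
            "region": ("neuroid", ["some_region"] * num_neuroids),
        },
        dims=["presentation", "neuroid"],
    )
    assert len(assembly["stimuli"]) == num_stimuli  # sentence text lives on the presentation dimension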