From 729611ebabe74b637af408113d430e6dbcd25fb9 Mon Sep 17 00:00:00 2001 From: Hao Lyu Date: Thu, 25 Jan 2024 20:57:08 -0500 Subject: [PATCH 1/3] Null Results Uploading new null results guide. The latest version is completely different from the first version. It's written by Jenny and previously reviewed by both NI and GB. This is the first time that this version is uploaded on Github. Action request: Please make a decision whether this guide is ready for a final review. --- guides/interpretation/null-results.bib | 133 ++++++++++++++++------ guides/interpretation/null-results_en.qmd | 106 +++++++---------- 2 files changed, 135 insertions(+), 104 deletions(-) diff --git a/guides/interpretation/null-results.bib b/guides/interpretation/null-results.bib index cf9cc67..0bfedd5 100644 --- a/guides/interpretation/null-results.bib +++ b/guides/interpretation/null-results.bib @@ -1,42 +1,101 @@ -%% This BibTeX bibliography file was created using BibDesk. -%% https://bibdesk.sourceforge.io/ +@article{adida_2020, + title = {When {{Does Information Influence Voters}}? {{The Joint Importance}} of {{Salience}} and {{Coordination}}}, + shorttitle = {When {{Does Information Influence Voters}}?}, + author = {Adida, Claire and Gottlieb, Jessica and Kramon, Eric and McClendon, Gwyneth}, + date = {2020-05-01}, + journaltitle = {Comparative Political Studies}, + volume = {53}, + number = {6}, + pages = {851--891}, + publisher = {{SAGE Publications Inc}}, + issn = {0010-4140}, + doi = {10.1177/0010414019879945}, + url = {https://doi.org/10.1177/0010414019879945}, + urldate = {2023-12-22}, + abstract = {Scholars argue that access to information about a politician’s programmatic performance helps voters reward good performers and punish poor ones. But in places where resources are made conditional on collective electoral behavior, voters may not want to defect to vote for a strong legislative performer if they do not believe that others will. We argue that two conditions must hold for information about politician performance to affect voter behavior: Voters must care about the information and believe that others in their constituency care as well. In a field experiment around legislative elections in Benin, voters rewarded good programmatic performance only when information was both made relevant to voters and widely disseminated within the electoral district. Otherwise, access to positive legislative performance information actually lowered vote share for the incumbent’s party. These results demonstrate the joint importance of Salience and voter coordination in shaping information’s impact in clientelistic democracies.}, + langid = {english} +} -%% Created for Matt Lisiecki at 2023-02-09 09:48:32 -0500 +@article{arel-bundock_2023, + title = {Quantitative {{Political Science Research}} Is {{Greatly Underpowered}}}, + author = {Arel-Bundock, Vincent and Briggs, Ryan and Doucouliagos, Hristos and Aviña, Marco M. and Stanley, T. D.}, + date = {2023-12-22}, + publisher = {{OSF}}, + doi = {10.31219/osf.io/7vy2f}, + url = {https://osf.io/7vy2f}, + urldate = {2023-12-22}, + abstract = {We analyze the statistical power of political science research by collating over 16,000 hypothesis tests from about 2,000 articles. Even with generous assumptions, the median analysis has about 10\% power, and only about 1 in 10 tests have at least 80\% power to detect the consensus effects reported in the literature. 
There is also substantial heterogeneity in tests across research areas, with some being characterized by high-power but most having very low power. To contextualize our findings, we survey political methodologists to assess their expectations about power levels. Most methodologists greatly overestimate the statistical power of political science research.}, + langid = {american}, + file = {C\:\\Users\\jham9\\Zotero\\storage\\Y6VJE9ZC\\Arel-Bundock et al. - 2023 - Quantitative Political Science Research is Greatly.pdf;C\:\\Users\\jham9\\Zotero\\storage\\FPG97PEI\\7vy2f.html} +} +@article{bjorkman_2009, + title = {Power to the {{People}}: {{Evidence}} from a {{Randomized Field Experiment}} on {{Community-Based Monitoring}} in {{Uganda}}}, + shorttitle = {Power to the {{People}}}, + author = {Björkman, Martina and Svensson, Jakob}, + date = {2009}, + journaltitle = {The Quarterly Journal of Economics}, + volume = {124}, + number = {2}, + eprint = {40506242}, + eprinttype = {jstor}, + pages = {735--769}, + issn = {0033-5533}, + url = {http://www.jstor.org/stable/40506242}, + urldate = {2018-03-07}, + abstract = {This paper presents a randomized field experiment on community-based monitoring of public primary health care providers in Uganda. Through two rounds of village meetings, localized nongovernmental organizations encouraged communities to be more involved with the state of health service provision and strengthened their capacity to hold their local health providers to account for performance. A year after the intervention, treatment communities are more involved in monitoring the provider, and the health workers appear to exert higher effort to serve the community. We document large increases in utilization and improved health outcomes—reduced child mortality and increased child weight—that compare favorably to some of the more successful community-based intervention trials reported in the medical literature.} +} -%% Saved with string encoding Unicode (UTF-8) +@article{christensen_2021, + title = {Building {{Resilient Health Systems}}: {{Experimental Evidence}} from {{Sierra Leone}} and {{The}} 2014 {{Ebola Outbreak}}*}, + shorttitle = {Building {{Resilient Health Systems}}}, + author = {Christensen, Darin and Dube, Oeindrila and Haushofer, Johannes and Siddiqi, Bilal and Voors, Maarten}, + date = {2021-05-01}, + journaltitle = {The Quarterly Journal of Economics}, + shortjournal = {The Quarterly Journal of Economics}, + volume = {136}, + number = {2}, + pages = {1145--1198}, + issn = {0033-5533}, + doi = {10.1093/qje/qjaa039}, + url = {https://doi.org/10.1093/qje/qjaa039}, + urldate = {2023-12-22}, + abstract = {Skepticism about the quality of health systems and their consequent underuse are thought to contribute to high rates of mortality in the developing world. The perceived quality of health services may be especially critical during epidemics, when people choose whether to cooperate with response efforts and frontline health workers. Can improving the perceived quality of health care promote community health and ultimately help to contain epidemics? We leverage a field experiment in Sierra Leone to answer this question in the context of the 2014 West African Ebola crisis. Two years before the outbreak, we randomly assigned two interventions to government-run health clinics—one focused on community monitoring, and the other conferred nonfinancial awards to clinic staff. Prior to the Ebola crisis, both interventions increased clinic utilization and patient satisfaction. 
Community monitoring additionally improved child health, leading to 38\% fewer deaths of children under age five. Later, during the crisis, the interventions also increased reporting of Ebola cases by 62\%, and community monitoring significantly reduced Ebola-related deaths. Evidence on mechanisms suggests that both interventions improved the perceived quality of health care, encouraging patients to report Ebola symptoms and receive medical care. Improvements in health outcomes under community monitoring suggest that these changes partly reflect a rise in the underlying quality of administered care. Overall, our results indicate that promoting accountability not only has the power to improve health systems during normal times, but can also make them more resilient to emergent crises.}, + file = {C\:\\Users\\jham9\\Zotero\\storage\\E5WNJFQX\\Christensen et al. - 2021 - Building Resilient Health Systems Experimental Ev.pdf;C\:\\Users\\jham9\\Zotero\\storage\\K8LDF37S\\5996193.html} +} +@article{franco_2014, + title = {Publication Bias in the Social Sciences: {{Unlocking}} the File Drawer}, + shorttitle = {Publication Bias in the Social Sciences}, + author = {Franco, Annie and Malhotra, Neil and Simonovits, Gabor}, + date = {2014-09-19}, + journaltitle = {Science}, + volume = {345}, + number = {6203}, + pages = {1502--1505}, + publisher = {{American Association for the Advancement of Science}}, + doi = {10.1126/science.1255484}, + url = {https://www.science.org/doi/full/10.1126/science.1255484}, + urldate = {2023-12-22}, + abstract = {We studied publication bias in the social sciences by analyzing a known population of conducted studies—221 in total—in which there is a full accounting of what is published and unpublished. We leveraged Time-sharing Experiments in the Social Sciences (TESS), a National Science Foundation–sponsored program in which researchers propose survey-based experiments to be run on representative samples of American adults. Because TESS proposals undergo rigorous peer review, the studies in the sample all exceed a substantial quality threshold. Strong results are 40 percentage points more likely to be published than are null results and 60 percentage points more likely to be written up. We provide direct evidence of publication bias and identify the stage of research production at which publication bias occurs: Authors do not write up and submit null findings.}, + file = {C:\Users\jham9\Zotero\storage\74ZBSQ5P\Franco et al. - 2014 - Publication bias in the social sciences Unlocking.pdf} +} - -@article{bhatti_et_al_2016, - author = {Bhatti, Yosef and Dahlgaard, Jens Olav and Hansen, Jonas Hedegaard and Hansen, Kasper M.}, - date-added = {2023-02-09 09:46:29 -0500}, - date-modified = {2023-02-09 09:47:28 -0500}, - journal = {British Journal of Political Science}, - number = {1}, - pages = {279-290}, - title = {Is Door-to-Door Canvassing Effective in Europe? Evidence from a Meta-study across Six European Countries}, - volume = {49}, - year = {2016}} - -@article{bhatti_et_al_2018, - author = {Bhatti, Yosef and Dahlgaard, Jens Olav and Hansen, Jonas Hedegaard and Hansen, Kasper M.}, - date-added = {2023-02-09 09:42:58 -0500}, - date-modified = {2023-02-09 09:45:43 -0500}, - journal = {West European Politics}, - number = {1}, - pages = {240-260}, - title = { Can governments use Get Out The Vote letters to solve Europe's turnout crisis? 
Evidence from a field experiment}, - volume = {41}, - year = {2018}} - -@article{abadie_2020, - author = {Abadie, Alberto}, - date-added = {2023-02-09 09:41:33 -0500}, - date-modified = {2023-02-09 09:42:06 -0500}, - journal = {American Economic Review: Insights}, - number = {2}, - pages = {193-208}, - title = {Statistical Nonsignificance in Empirical Economics}, - volume = {2}, - year = {2020}} +@article{lelkes_2021, + title = {Policy over Party: Comparing the Effects of Candidate Ideology and Party on Affective Polarization}, + shorttitle = {Policy over Party}, + author = {Lelkes, Yphtach}, + date = {2021-01}, + journaltitle = {Political Science Research and Methods}, + volume = {9}, + number = {1}, + pages = {189--196}, + publisher = {{Cambridge University Press}}, + issn = {2049-8470, 2049-8489}, + doi = {10.1017/psrm.2019.18}, + url = {https://www.cambridge.org/core/journals/political-science-research-and-methods/article/abs/policy-over-party-comparing-the-effects-of-candidate-ideology-and-party-on-affective-polarization/7CE28F0E9763297A765263B1F774B7A1}, + urldate = {2023-12-22}, + abstract = {At least two theories have been offered that explain the rise of affective polarization. Some scholars, relying on social identity theory, argue that as the relevance of party identification increased, Americans became more likely to see their in-party in positive terms and the out-party in negative terms. Other scholars argue that affective polarization is a reaction to increasingly extreme political actors. This study seeks to arbitrate between these two theories of affective polarization through a survey experiment which asks respondents to rate candidates whose party (or lack thereof) and ideology (or lack thereof) is randomly assigned. In line with the policy-oriented view of affective polarization, respondents reacted far more strongly to ideology than party, especially if it was the ideology of the member of the out-party.}, + langid = {english}, + keywords = {Public opinion} +} diff --git a/guides/interpretation/null-results_en.qmd b/guides/interpretation/null-results_en.qmd index 3d0b3d6..c1c38e7 100644 --- a/guides/interpretation/null-results_en.qmd +++ b/guides/interpretation/null-results_en.qmd @@ -1,99 +1,71 @@ --- -title: "10 Things Your Null Result Might Mean" +title: "10 Things Your Null Results Might Mean" author: - - name: "Rekha Balu" - url: https://cabs.mdrc.org/team/rekha-balu -bibliography: null-results.bib -image: null-results.png -abstract: | - After the excitement and hard work of running a field experiment is over, it’s not uncommon to hear policymakers and researchers express disappointment when they end up hearing that the intervention did not have a detectable impact. - This guide explains that a null result rarely means “the intervention didn’t work,” even though that tends to be the shorthand many people use. Instead, a null result can reflect the myriad design choices that policy implementers and researchers make in the course of developing and testing an intervention. After all, people tend to label [hypothesis tests](https://methods.egap.org/guides/analysis-procedures/hypothesis-testing_en.html) with high p-values as “null results”, and hypothesis tests (as summaries of information about design and data) can produce large p-values for many reasons. Policymakers can make better decisions about what to do with a null result when they understand how and why they got that result. + - name: "Jennifer A. 
Hamilton" + url: https://sites.google.com/view/jennifer-a-hamilton/ +abstract: "A null result is when a hypothesis test indicates that there is not enough evidence to say an intervention (treatment) changed outcomes in a study. Null results might occur because the intervention truly has no effect or because there is not enough information to tell that an effect exists." +bibliography: null_results.bib +format: docx +always_allow_html: yes --- -Imagine you lead the department of education for a government and are wondering about how to boost student attendance. You decide to consider a text message intervention that offers individual students counseling. Counselors at each school can help students address challenges specifically related to school attendance. Your team runs a randomized trial of the intervention, and tells you there is a null result. +# A null result indicates that a study did not generate evidence to conclude that an intervention changed outcomes. -How should you understand the null result, and what should you do about it? It could be a result of unmet challenges at several stages of your work -- in the way the intervention is designed, the way the intervention is implemented, or the way study is designed Below are 10 things to consider when interpreting your null result. - -Intervention Design -== +There may not be evidence either because the intervention does not, in fact, change outcomes or because the study failed to gather adequate evidence of an effect that actually exists. Findings, including null results, are a function of both how the world works and the approaches used to learn about it. -## 1. Your intervention theory and approach are mismatched to the problem. -You delivered a counseling intervention because you thought that students needed support to address challenges in their home life. However, students who had the greatest needs never actually met with a counselor, in part because they did not trust adults at the school. The theory of change assumed that absenteeism was a function primarily of a student’s personal decisions or family circumstances and that the offer of counseling without changes to school climate would be sufficient; it did not account appropriately for low levels of trust in teacher-student relationships. Therefore, this null effect does not suggest that counseling per se cannot boost attendance, but that counseling in the absence of other structural or policy changes or in the context of low-trust schools may not be sufficient. +To learn more about hypothesis testing and see sample code, see [10 Things to Know About Hypothesis Testing](https://methods.egap.org/guides/analysis-procedures/hypothesis-testing_en.html). -***How can you tell if***...you have a mismatch between your theory of change and the problem that needs to be solved? List all potential barriers and consider how they connect. Does the intervention as designed address only one of those barriers, and, if so, can it succeed without addressing others? Are there assumptions made about one source or one cause that may undermine the success of the intervention? +# Well-designed studies with null effects are important contributions to expanding knowledge and understanding of the world. -## 2. Your intervention strength or dosage is too low for the problem or outcome of interest. 
-After talking to experts, you learn that counseling interventions can build trust, but usually require meetings that are more frequent and regular than your intervention offered to have the potential for an effect. Maybe your “dose” of services is too small. +Researchers sometimes think of null results as neither interesting nor useful. However, when studies with null results are not published,[^1] researchers might waste limited time and resources conducting similar studies. The underrepresentation of null results also means that the published literature tends to overestimate true effect sizes. It is therefore important for researchers to write up and make publicly available results from each experiment they conduct. -***How can you tell if***...you did not have a sufficient “dose”? Even if no existing services tackle your problem of interest, consider what is a minimum level, strength, or dose that is both feasible to implement and could yield an effect. When asking sites what they are willing to take on, beware of defaulting to the lowest dose. The more complex the problem or outcome is to move, the stronger or more comprehensive the intervention may need to be. +[^1]: Currently, studies with null results are underrepresented in published research. To learn more about *publication bias* in the social sciences, see @franco_2014. -## 3. Your intervention does not represent a large enough enhancement over usual services. -In your position at the state department of education, you learn that students at the target schools were already receiving some counseling and support services. Even though the existing services were not sufficient to boost attendance to the targeted levels, the new intervention did not add enough content or frequency of the counseling services to reach those levels either---the intervention yielded show-up rates that were about the same as existing services. So this null effect does not reflect that counseling has no effect, but rather that the version of counseling your intervention offered was not effective over and above existing counseling services. +# Sometimes null results are an artifact of true zero effects: null results might reflect that an intervention in fact does not move outcomes. -***How can you tell if***...the relative strength of your intervention was not sufficient to yield an effect? Take stock of the structure and content of existing services, and consider if the extent or form in which clients respond to existing services indicates that the theory of change or approach needs to be revised. If the theory holds, use existing services as a benchmark and consider whether your proposed intervention needs to include something supplementary and/or something complementary. - - -Intervention Implementation -== -Programs rarely rollout exactly as intended, but some variations are more problematic than others. +If the intervention does not work, is too weak, or if the outcomes of interest are resistant to change, an intervention may simply not affect outcomes. For example, [Metaketa I](https://egap.org/our-work/the-metaketa-initiative/round1-information-accountability/) theorized that providing citizens with information about incumbent performance would enhance political accountability. In Benin, one research team found that a light-touch information intervention did not change voter behavior [@adida_2020]. -## 4. Your implementation format was not reliable. 
-In the schools in your study, counseling interventions sometimes occurred in person, sometimes happened by text message, sometimes by phone. Anticipating and allowing for some variation and adaptation is important. Intervention dosage and strength is often not delivered as designed nor to as many people as expected. +# Sometimes null results are an artifact of true zero effects: null results might not generalize: the intervention truly does not work in the study context but it may work in others -But unplanned variations in format can reflect a host of selection bias issues, such that you cannot disentangle whether counseling as a concept does not work or whether certain formats of outreach did not work. This is especially important to guard against if you intend to test specific channels or mechanisms critical to your theory of change. +Context matters! The same study carried out in two different places or at two different times can generate different results. Sometimes an intervention that is ineffective in one setting will work in other contexts. Because of this, researchers should not conclude an intervention can never change outcomes on the basis of one study. -***How can you tell if***...an unreliable format is the reason for your null? Were you able to specify or standardize formats in a checklist? Could you leave enough discretion but still incentivize fidelity? Pre-specifying what the intervention should look like can help staff and researchers monitor along the way and correct inconsistencies or deviations that may affect the results. This could include a training protocol for those implementing the intervention. If nothing was specified or no one was trained, then the lack of consistency may be part of the explanation. - -## 5. Your intervention and outcome measure are mismatched to your randomization design. -You expected counseling to be more effective in schools with higher student-to-teacher ratios, but did not block randomize by class size (for more on block randomization, see our guide on [10 Things to Know About Randomization](https://egap.org/resource/10-things-to-know-about-randomization/)). then it may no longer have the potential to be more effective for students in high class size schools. +One illustration of this scenario is a series of studies in Africa testing whether community-based monitoring of health services improve healthcare uptake and outcomes. One initial study in Uganda found promising results [@bjorkman_2009]. Ten years later, two additional teams replicated the intervention in Sierra Leone [@christensen_2021] and [Uganda](https://egap.org/resource/does-bottom-up-accountability-work-evidence-from-uganda/). The Sierra Leonean study also found promising results. In contrast, the Ugandan replication study had largely null findings. -***How can you tell if***...you have a mismatch between your intervention and randomization design? Consider whether treatment effects could vary, or service delivery might occur in a [cluster](https://methods.egap.org/guides/data-strategies/cluster-randomization_en.html), or intervention concepts could [spill over](https://methods.egap.org/guides/data-strategies/spillovers_en.html), and to what extent your randomization design accounted for that. +# Sometimes null results are an artifact of true zero effects: null effects might reflect opposite reactions to the intervention by different units in the study. -Study Design -== +Studies often focus on average treatment effects. 
However, the average treatment effect might mask important variation in effects across units within the study. Positive effects among some units may cancel out negative effects among other units, producing an average treatment effect indistinguishable from zero. For example, some respondents in an experiment in the United States felt more warmly toward the candidate after learning about a political candidate's partisanship, while others felt more cold toward the candidate. The direction of the effect depended on the respondent's own partisanship [@lelkes_2021]. Studies that fail to take into account how the direction of the effect depends on unit characteristics may generate null effects overall, even though the treatment shifted outcomes for many units. Even a larger sample size would have produced a null average treatment effect. -## 6. Your study sample includes people whose behavior could not be moved by the intervention. -You ask schools to randomize students into an intervention or control (business-as-usual) group. Some students in both your intervention and control groups will always attend school, while some students will rarely attend school, regardless of what interventions are or are not offered to them. Your intervention’s success depends on not just whether students actually receive the message and/or believe it, but also on whether it can shift behavior among such potential responders. +To learn more about heterogeneous treatment effects and see sample code, see [10 Things to Know about Heterogeneous Treatment Effects](https://methods.egap.org/guides/research-questions/heterogeneous-effects_en.html). -If the proportion of potential responders is too small, then it may be difficult to detect an effect. In addition, your intervention may need to be targeted and modified in some way to address the needs of potential responders. +# Sometimes null results are an artifact of research design: null results might reflect an underpowered research design. -***How can you tell if***...the proportion of potential responders may be too small? Take a look at the pre-intervention attendance rate. If it is extremely low, does that rate reflect low demand or structural barriers that may limit the potential for response? Is it so high that it tells us that most people who could respond have already done so (say, 85% or higher)? Even if there is a large proportion of hypothetical potential responders, is it lower when you consider existing barriers preventing students from using counseling services that your intervention is not addressing? - -## 7. Your measure is not validated or reliable: It varies too much and systematically across sites. -As a leader of the state’s department of education, you want to measure the effectiveness of your intervention using survey data on student attitudes related to attendance. You learn that only some schools administer a new survey measuring student attitudes, and those with surveys changed the survey items so that there is not the same wording across surveys or schools. If you observe no statistically significant difference on a survey measure that is newly developed or used by only select schools, it may be difficult to know whether the intervention “has no effect” or whether the outcome is measuring something different in each school because of different wording. +A research design is underpowered when design features undermine the ability to reliably detect a true effect. Inadequate power is a ubiquitous problem in social science research. 
In a recent working paper reviewing 16,000 hypothesis tests from 2,000 political science articles, @arel-bundock_2023 found that the median study has only 10% power. In other words, studies of an intervention with a true effect will generate null results nine out of ten times simply because of insufficient sample size. Only 10% of the studies in the review were powered at 80%, a commonly-used threshold for minimum levels of power. Several features of research design may contribute to lack of statistical power. To learn more about statistical power and see sample code, see [10 Things to Know about Statistical Power](https://methods.egap.org/guides/assessing-designs/power_en.html). -***How can you tell if***... outcome measurement is the problem? Check to see whether the outcome is (1) collected in the same way across your sites and (2) if it means the same thing to the participants as it means to you. In addition, check on any reporting bias and if your study participants or sites face any pressure from inside or outside of their organizations to report or answer in a particular way. - -## 8. Your outcome is not validated or reliable: It varies too little. -Given the problems with the survey measure, you then decide to use administrative data from student records to measure whether students show up on time to school. But it turns out that schools used a generic definition of “on time” such that almost every student looks like they arrive on time. An outcome that does not have enough variation in it to detect an effect between intervention and control groups can be especially limiting if your intervention potentially could have had different effects on different types of students, but the outcome measure used in the study lacks the precision to capture the effects on different subgroups. +First, research designs with imprecise measurement strategies are more likely to yield null results. Measurement strategies may not be sensitive enough to capture the changes that occurred. For example, consider a study examining whether exercise improves cardiovascular health. In the short term, it will be easier to detect improvements in cardiovascular health through more finer measures like resting heart rate compared to more blunt measures like whether the individual died. To learn more about indices and see sample code, see [10 Things to Know About Indices](https://methods.egap.org/guides/data-strategies/indices_en.html). To learn more about measurement generally, see [10 Things to Know About Measurement in Experiments](https://methods.egap.org/guides/data-strategies/measurement_en.html). -***How can you tell if***...your null result arises from measures that are too coarse or subject to response biases? Pressures to report a certain kind of outcome faced by people at your sites could again yield this kind of problem with outcome measurement. So, it is again worth investigating the meaning of the outcomes as reported by the sites from the perspective of those doing the reporting. This problem differs from the kind of ceiling and floor effects discussed elsewhere in this guide; it arises more from the strategic calculations of those producing administrative data and less from the natural behavior of those students whose behavior you are trying to change. - -## 9. Your [statistical power](https://methods.egap.org/guides/assessing-designs/power_en.html) is insufficient to detect an effect for the intervention as implemented. -This may sound obvious to people with experience testing interventions at scale. 
But researchers and policymakers can fall into two traps: +Second, research designs with small samples can produce null results even with sensitive measurement strategies. More units are needed to distinguish between a small average effect and a true zero effect than between a large average effect and a true zero effect. An effect may be small on average if an intervention produces small effects among many treated units or if it produces larger effects among only a few treated units. -1. Thinking about statistical significance rather than what represents a meaningful and feasible effect. Although a study with an incredibly large sample size can detect small effects with precision, one does not want to trade precision for meaning. Moreover, an intervention known to be weak during the intervention design is likely to be weaker when implemented, especially across multiple sites or months. So it may not be sufficient to simply enroll more subjects to study an intervention known to be weak (even though strong research design cannot compensate for a weak intervention in any easy or direct way); +# Sometimes null results because designs interact with the real world in unexpected ways: null results might reflect because treatments or their effects spill over from units in one experimental condition to another. -2. Thinking that the only relevant test statistic for an experiment effect is a difference of means (even though we have long known that differences of means are valid but low-powered test statistics when outcomes do not neatly fit into a normal distribution). +The intervention or its effects may spill over to units that were not assigned to receive that intervention. For example, if an intervention provides cash transfers to treated units, treated units may share the cash with other units not assigned to that intervention. If an intervention reduces racial prejudice, social connections among units may create shifts in racial attitudes and norms among units assigned to control as well as those assigned to treatment. Although the intervention produces real changes in these situations, the spillovers from the intervention targets to other units makes the changes difficult to detect when comparing study units to one another. -***How can you tell if***...your null result arises mostly from low statistical power? Recall that statistical power depends on (a) effect size or intervention strength, (b) variability in outcomes, (c) the number of independent observations (often well measured with sample size), and (d) the test statistic you use. The previous discussions pointed out ways to learn whether an intervention you thought might be strong was weak, or whether an outcome that you thought might be clear could turn out to be very noisy. +Some research designs anticipate and measure spillovers. To learn more about spillovers, see [10 Things to Know About Spillovers](https://methods.egap.org/guides/data-strategies/spillovers_en.html). -A formal power analysis could also tell you that, given the variability in your outcome and the size of your effect, you would have needed a larger sample size to detect this effect reliably. For example, if you had known about the variability in administration of the treatment or the variability in the outcome (let alone surprises with missing data) in advance, your pre-field power analysis would have told you to use a different sample size. 
+# Sometimes null results because designs interact with the real world in unexpected ways: null results might reflect incomplete intervention implementation. -A different test statistic can also change a null result into a positive result if, say, the effect is large but it is not an effect that shifts means as much as moves people who are extreme, or has the effect of making moderate students extreme. A classic example of this problem occurs with outcomes that have very long tails -- such as those involving money, like annual earnings or auction spending. [A t-test might produce a p-value of .20 but a rank-based test might produce a p-value of < .01](https://oes.gsa.gov/projects/gsa-auctions/). The t-test is using evidence of a shift in averages (means) to reflect on the null hypothesis of no effects. The rank-based test is merely asking whether the treatment group outcomes tend to be bigger than (or smaller than) the control group outcomes (whether or not they differ in means). - -Not all nulls are the result of a flaw in design or implementation! -== +Designated control units may inadvertently receive treatment or units assigned to treatment might not receive it.[^2] For example, one [study](https://www.gsb.stanford.edu/insights/everything-can-go-wrong-field-experiment-what-do-about-it) aimed to examine the effects of conditional cash transfer programs to improve school attendance. In the control condition, participants should have received unconditional cash transfers regardless of school attendance. However, the government implementing the transfer programs required families to enroll children in school in order to receive the "unconditional" funds, thus making the control group similar to the treatment group. This dynamic can bias results toward null findings even if there is actually an effect. -## 10. Your null needs to be published. -If you addressed all the issues above related to intervention design, sample size and research design, and have a precisely estimated, statistically significant null result, it is time to publish. Your colleagues and other researchers [need to learn from this finding](https://oes.gsa.gov/assets/files/unexpected-results-2-pager.pdf), so do not keep it to yourself. +[^2]: To learn more about *non-compliance* (when whether a unit received treatment status does not match whether it was assigned to receive treatment), see [How to design and implement an experiment (life cycle)](https://methods.egap.org/guides/data-strategies/how-to_en.html). -When you have a precise null, you do not have a gap in evidence--you are generating evidence. +# Sometimes null results because designs interact with the real world in unexpected ways: null results might reflect differential attrition. -***What can you do to convince editors and reviewers they should publish your null results?*** This guide should help you reason about your null results and thus explain their importance. If other studies on your topic exist, you can also contextualize your results; for example, follow some of the ideas from @abadie_2020. +An intervention or its effects may cause treated units to drop out of the study. If units where the treatment was effective are less likely to complete the study, then the study will likely underestimate the treatment's impacts. Differential attrition may then explain null results. 
-For an example, see how @bhatti_et_al_2018--in their study of a Danish governmental voter turnout intervention--used previous work on face-to-face voter turnout (reported on as a meta-analysis in @bhatti_et_al_2016) to contextualize their own small effects. +To learn more about attrition and see sample code, see [10 Things to Know About Missing Data](https://methods.egap.org/guides/data-strategies/missing-data_en.html). -If you are unable to find a publication willing to include a study with null results in their journal, you can still contribute to the evidence base on the policy area under examination by making your working paper, data, and/or analysis code publicly available. Many researchers choose to do so via their personal websites; in addition, there are repositories (such as the [Open Science Framework](https://osf.io/)) that provide a platform for researchers to share their in-progress and unpublished work. +# Understanding why an intervention did not work is hard. + +Understanding why an intervention didn't work as expected can be harder than understanding why it did. Researchers design studies to confirm theories about how the world works. If a study generates null results, researchers must again engage in theory building to explain their findings. Talking to implementing partners and study participants or conducting additional statistical analyses (like balance tests or tests for heterogeneous treatment effects) might help researchers identify or rule out some explanations. After this process, researchers can design additional studies to test new explanations and theories. + +# References {.unnumbered .unlisted} -# References \ No newline at end of file From 31a7c87b03b13d7cdfb5b5c54756f3d252138148 Mon Sep 17 00:00:00 2001 From: nahomi Date: Thu, 14 Mar 2024 20:55:06 -0400 Subject: [PATCH 2/3] edits to null results MG --- guides/interpretation/null-results.bib | 11 +- guides/interpretation/null-results_en.html | 417 +++++++++++++++++++++ 2 files changed, 427 insertions(+), 1 deletion(-) create mode 100644 guides/interpretation/null-results_en.html diff --git a/guides/interpretation/null-results.bib b/guides/interpretation/null-results.bib index 0bfedd5..dbf8426 100644 --- a/guides/interpretation/null-results.bib +++ b/guides/interpretation/null-results.bib @@ -15,7 +15,16 @@ @article{adida_2020 abstract = {Scholars argue that access to information about a politician’s programmatic performance helps voters reward good performers and punish poor ones. But in places where resources are made conditional on collective electoral behavior, voters may not want to defect to vote for a strong legislative performer if they do not believe that others will. We argue that two conditions must hold for information about politician performance to affect voter behavior: Voters must care about the information and believe that others in their constituency care as well. In a field experiment around legislative elections in Benin, voters rewarded good programmatic performance only when information was both made relevant to voters and widely disseminated within the electoral district. Otherwise, access to positive legislative performance information actually lowered vote share for the incumbent’s party. 
These results demonstrate the joint importance of Salience and voter coordination in shaping information’s impact in clientelistic democracies.}, langid = {english} } - +@article{humphreys_exporting_2019, + title = {Exporting Democratic Practices: {{Evidence}} from a Village Governance Intervention in {{Eastern Congo}}}, + author = {Humphreys, Macartan and {de la Sierra}, Ra{\'u}l S{\'a}nchez and {van der Windt}, Peter}, + year = {2019}, + journal = {Journal of Development Economics}, + volume = {140}, + pages = {279--301}, + issn = {0304-3878}, + doi = {10.1016/j.jdeveco.2019.03.011} + } @article{arel-bundock_2023, title = {Quantitative {{Political Science Research}} Is {{Greatly Underpowered}}}, author = {Arel-Bundock, Vincent and Briggs, Ryan and Doucouliagos, Hristos and Aviña, Marco M. and Stanley, T. D.}, diff --git a/guides/interpretation/null-results_en.html b/guides/interpretation/null-results_en.html new file mode 100644 index 0000000..2670481 --- /dev/null +++ b/guides/interpretation/null-results_en.html @@ -0,0 +1,417 @@ + + + + + + + + + + +10 Things Your Null Results Might Mean + + + + + + + + + + + + + + + + + + + +
# 10 Things Your Null Results Might Mean

Author: Jennifer A. Hamilton

A null result arises when a hypothesis test indicates that there is not enough evidence to say an intervention (treatment) changed outcomes in a study. Null results might occur because the intervention truly has no effect or because there is not enough information to detect an effect that exists.
# A null result indicates that a study did not generate evidence to conclude that an intervention changed outcomes.

There may not be evidence either because the intervention does not, in fact, change outcomes or because the study failed to gather adequate evidence of an effect that actually exists. Null results, like all other findings, are a function of both how the world works and the research design and statistical methods used to learn about the world.
# Well-designed studies with null effects are important contributions to knowledge about the world.

Researchers sometimes think of null results as neither interesting nor useful, but it is important to make the results of all experiments publicly available. Studies with null results are underrepresented in published research.[^1] When studies with null results are not disseminated, other researchers might conduct similar studies thinking they are exploring new ground, instead of directing their time and resources in more fruitful directions. The underrepresentation of null results also means that the impression of effect sizes given by the published literature is inflated.
# Sometimes null results are an artifact of true zero effects:

## a. Null results might reflect that an intervention in fact does not move outcomes.
If the intervention does not work, is too weak, or if the outcomes of interest are resistant to change, an intervention may simply not affect outcomes. For example, Metaketa I theorized that providing citizens with information about incumbent performance would enhance political accountability. In Benin, one research team found that a light-touch information intervention did not change voter behavior (Adida et al. 2020).

## b. However, the intervention may work in other contexts: the null result might not generalize.
The same study carried out in two different places or at two different times can generate different results. Sometimes an intervention that is ineffective in one setting will work in other contexts. Because of this, researchers should not conclude that an intervention can never change outcomes on the basis of one study. For example, several studies in Africa tested whether community-based monitoring of health services improves healthcare uptake and outcomes. Björkman and Svensson (2009) initially found promising results in Uganda. Ten years later, a Sierra Leonean study also found promising results (Christensen et al. 2021), but a Ugandan replication study had largely null findings.

## c. Null effects might result from some units responding positively and other units negatively to the intervention.
Randomized experiments generally focus on average treatment effects. However, the average treatment effect might mask important variation in effects across units within the study. Positive effects among some units may cancel out negative effects among other units, producing an average treatment effect indistinguishable from zero. For example, in one experiment in the United States, some respondents felt more warmly toward a political candidate after learning the candidate's partisanship, while others felt colder toward the candidate. The direction of the effect depended on the respondent's own partisanship (Lelkes 2021). Studies that fail to take into account how the direction of the effect depends on characteristics of individual subjects may generate null effects overall, even though the treatment shifted outcomes for many units. Even a larger sample size would have produced a null finding. To learn more about heterogeneous treatment effects, see 10 Things to Know about Heterogeneous Treatment Effects.
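A minimal sketch of this canceling-out pattern (a hypothetical Python simulation; the two subgroups and the opposite 0.5-unit effects are invented for illustration and are not the Lelkes (2021) data):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(7)

n = 2000  # a large sample, so low power is not the issue here
subgroup = rng.integers(0, 2, size=n)        # e.g., in-party vs. out-party
treat = rng.integers(0, 2, size=n)           # random assignment
effect = np.where(subgroup == 1, 0.5, -0.5)  # opposite-signed true effects

outcome = rng.normal(size=n) + treat * effect

# The overall ATE estimate is typically indistinguishable from zero...
ate = outcome[treat == 1].mean() - outcome[treat == 0].mean()
pval = stats.ttest_ind(outcome[treat == 1], outcome[treat == 0]).pvalue
print(f"overall ATE estimate: {ate:+.3f} (p = {pval:.3f})")

# ...while each subgroup has a real effect of opposite sign.
for g in (0, 1):
    m = subgroup == g
    diff = outcome[m & (treat == 1)].mean() - outcome[m & (treat == 0)].mean()
    print(f"subgroup {g} effect estimate: {diff:+.3f}")
```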
# Sometimes null results are an artifact of research design: null results might reflect an underpowered research design.

A research design is underpowered when design features undermine the ability to reliably detect a true effect. Inadequate power is a ubiquitous problem in social science research. In a recent working paper reviewing 16,000 hypothesis tests from 2,000 political science articles, Arel-Bundock et al. (2023) found that, even with generous assumptions, the median study had only 10% power. In other words, studies of an intervention with a true effect will generate null results nine times out of ten, simply because of insufficient sample size. Only 10% of the studies in the review were powered at 80%, a commonly used threshold for adequate power.

Features of the research design may contribute to this lack of statistical power. First, measurement strategies may not be sensitive enough to capture the changes that occurred. For example, consider a study of whether exercise improves cardiovascular health within six months. It will be easier to detect improvements through finer measures like resting heart rate than through blunter measures like whether the individual died. To learn more about measurement, see 10 Things to Know About Measurement in Experiments.
Second, research designs with small samples can produce null results even with sensitive measurement strategies. More units are needed to distinguish between a small average effect and a true zero effect than between a large average effect and a true zero effect. To learn more about statistical power, see 10 Things to Know about Statistical Power.
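One way to see this is a small simulated power calculation (an illustrative sketch; the 0.2-standard-deviation true effect and the sample sizes are arbitrary assumptions):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)

def simulated_power(n_per_arm, true_effect, n_sims=2000, alpha=0.05):
    """Share of simulated experiments whose t-test detects the true effect."""
    rejections = 0
    for _ in range(n_sims):
        control = rng.normal(0.0, 1.0, n_per_arm)
        treated = rng.normal(true_effect, 1.0, n_per_arm)
        if stats.ttest_ind(treated, control).pvalue < alpha:
            rejections += 1
    return rejections / n_sims

# The same true effect is detected reliably only with enough units.
for n in (25, 100, 400):
    print(f"n per arm = {n:4d}: power ~ {simulated_power(n, true_effect=0.2):.2f}")
```

Under these assumptions, power rises from roughly 10% at 25 units per arm to roughly 80% at 400 units per arm, so most small runs of the same experiment would report null results.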
# Sometimes null results occur because designs interact with the real world in unexpected ways:
## a. Null results might result from treatments or their effects spilling over from units in one experimental condition to another.
The intervention or its effects may spill over to units that were not assigned to receive that intervention. For example, if an intervention provides cash transfers to treated units, treated units may share the cash with other units not assigned to that intervention. If an intervention reduces racial prejudice, social connections among units may create shifts in racial attitudes and norms among units assigned to control as well as those assigned to treatment. Although the intervention produces real changes in these situations, the spillovers from the intervention targets to other units make the changes difficult to detect when comparing study units in one condition to one another. Some research designs anticipate and measure spillovers. To learn more about spillovers, see 10 Things to Know About Spillovers.
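A hypothetical sketch of this attenuation (the numbers are invented; a real analysis of interference requires a design built for it):

```python
import numpy as np

rng = np.random.default_rng(11)

n = 10000
treat = rng.integers(0, 2, size=n)
true_effect = 0.5

# Without spillovers, control units are untouched.
y_clean = rng.normal(size=n) + treat * true_effect

# With spillovers, suppose control units receive 60% of the effect
# through social ties, shrinking the treatment-control contrast.
spill_share = 0.6
y_spill = (rng.normal(size=n) + treat * true_effect
           + (1 - treat) * spill_share * true_effect)

for label, y in (("no spillovers", y_clean), ("with spillovers", y_spill)):
    diff = y[treat == 1].mean() - y[treat == 0].mean()
    print(f"{label}: estimated effect = {diff:.3f}")
```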
## b. Null results might reflect incomplete implementation of the intervention.
Units randomly assigned to control may inadvertently receive treatment, or units assigned to treatment might not receive it.[^2] For example, one study aimed to examine the effects of conditional cash transfer programs to improve school attendance. In the control condition, participants should have received unconditional cash transfers regardless of school attendance. However, the government implementing the transfer programs required families to enroll children in school in order to receive the "unconditional" funds, thus making the control group similar to the treatment group. This can bias results toward null findings even if there is actually an effect.
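The pull toward zero from incomplete implementation can be seen in a toy simulation (assumed numbers; here only half of the units assigned to treatment actually receive the intervention):

```python
import numpy as np

rng = np.random.default_rng(5)

n = 10000
assigned = rng.integers(0, 2, size=n)
# One-sided noncompliance: only 50% of units assigned to treatment
# actually receive it, and no control units do.
received = assigned * (rng.random(n) < 0.5)

true_effect = 0.4
y = rng.normal(size=n) + received * true_effect

# Comparing groups by *assignment* (intention-to-treat) dilutes the effect.
itt = y[assigned == 1].mean() - y[assigned == 0].mean()
print(f"true effect among those actually treated: {true_effect}")
print(f"intention-to-treat estimate: {itt:.3f}")  # about half the true effect
```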
## c. Null results might reflect differential attrition.
An intervention or its effects may cause treated units to drop out of the study. If units where the treatment was effective are less likely to complete the study, such as when a successful training program leads a subject to move outside the study area for work, then the study will likely underestimate the treatment's impacts. Differential attrition may then explain null results. To learn more about attrition and see sample code, see 10 Things to Know About Missing Data.
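A minimal sketch of how differential attrition biases the comparison toward zero (hypothetical numbers; the dropout rule is invented for illustration):

```python
import numpy as np

rng = np.random.default_rng(9)

n = 10000
treat = rng.integers(0, 2, size=n)
true_effect = 0.5
y = rng.normal(size=n) + treat * true_effect

# Suppose successfully treated units with high outcomes (e.g., trainees
# who found jobs elsewhere) often leave the study before endline.
dropped = (treat == 1) & (y > 1.0) & (rng.random(n) < 0.7)
observed = ~dropped

est = (y[observed & (treat == 1)].mean()
       - y[observed & (treat == 0)].mean())
print(f"true effect: {true_effect}")
print(f"estimate after differential attrition: {est:.3f}")  # biased toward zero
```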
# Understanding why an intervention did not work is difficult.
Understanding why an intervention didn't work as expected can be harder than understanding why it did. Researchers design studies based on their model of how the world works. If a study generates null results, researchers should first investigate whether this may be due to failures in research design, as Humphreys, de la Sierra, and van der Windt (2019) do. Researchers can then engage in theory-building to explain their findings. Talking to implementing partners and study participants, or conducting additional statistical analyses like balance tests or tests for heterogeneous treatment effects, might help researchers identify or rule out some explanations. After this process, researchers can design additional studies to test new explanations and theories.
# References
Adida, Claire, Jessica Gottlieb, Eric Kramon, and Gwyneth McClendon. 2020. "When Does Information Influence Voters? The Joint Importance of Salience and Coordination." Comparative Political Studies 53 (6): 851–91. https://doi.org/10.1177/0010414019879945.

Arel-Bundock, Vincent, Ryan Briggs, Hristos Doucouliagos, Marco M. Aviña, and T. D. Stanley. 2023. "Quantitative Political Science Research Is Greatly Underpowered." Working paper, December. https://doi.org/10.31219/osf.io/7vy2f.

Björkman, Martina, and Jakob Svensson. 2009. "Power to the People: Evidence from a Randomized Field Experiment on Community-Based Monitoring in Uganda." The Quarterly Journal of Economics 124 (2): 735–69. http://www.jstor.org/stable/40506242.

Christensen, Darin, Oeindrila Dube, Johannes Haushofer, Bilal Siddiqi, and Maarten Voors. 2021. "Building Resilient Health Systems: Experimental Evidence from Sierra Leone and the 2014 Ebola Outbreak." The Quarterly Journal of Economics 136 (2): 1145–98. https://doi.org/10.1093/qje/qjaa039.

Franco, Annie, Neil Malhotra, and Gabor Simonovits. 2014. "Publication Bias in the Social Sciences: Unlocking the File Drawer." Science 345 (6203): 1502–5. https://doi.org/10.1126/science.1255484.

Humphreys, Macartan, Raúl Sánchez de la Sierra, and Peter van der Windt. 2019. "Exporting Democratic Practices: Evidence from a Village Governance Intervention in Eastern Congo." Journal of Development Economics 140: 279–301. https://doi.org/10.1016/j.jdeveco.2019.03.011.

Lelkes, Yphtach. 2021. "Policy over Party: Comparing the Effects of Candidate Ideology and Party on Affective Polarization." Political Science Research and Methods 9 (1): 189–96. https://doi.org/10.1017/psrm.2019.18.

# Footnotes

[^1]: To learn more about publication bias in the social sciences, see Franco, Malhotra, and Simonovits (2014).

[^2]: To learn more about non-compliance (when the treatment a unit actually receives does not match the treatment it was assigned), see How to design and implement an experiment (life cycle).
+ + + + \ No newline at end of file From 42626d4984f176aaae8dd212b04c1872524b7527 Mon Sep 17 00:00:00 2001 From: nahomi Date: Thu, 14 Mar 2024 21:06:15 -0400 Subject: [PATCH 3/3] null results MG qmd file --- guides/interpretation/null-results_en.html | 417 --------------------- guides/interpretation/null-results_en.qmd | 71 ++-- 2 files changed, 40 insertions(+), 448 deletions(-) delete mode 100644 guides/interpretation/null-results_en.html diff --git a/guides/interpretation/null-results_en.html b/guides/interpretation/null-results_en.html deleted file mode 100644 index 2670481..0000000 --- a/guides/interpretation/null-results_en.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - -10 Things Your Null Results Might Mean - - - - - - - - - - - - - - - - - - - -
diff --git a/guides/interpretation/null-results_en.qmd b/guides/interpretation/null-results_en.qmd
index c1c38e7..0a5351a 100644
--- a/guides/interpretation/null-results_en.qmd
+++ b/guides/interpretation/null-results_en.qmd
@@ -3,69 +3,78 @@ title: "10 Things Your Null Results Might Mean"
 author:
   - name: "Jennifer A. Hamilton"
     url: https://sites.google.com/view/jennifer-a-hamilton/
-abstract: "A null result is when a hypothesis test indicates that there is not enough evidence to say an intervention (treatment) changed outcomes in a study. Null results might occur because the intervention truly has no effect or because there is not enough information to tell that an effect exists."
-bibliography: null_results.bib
-format: docx
-always_allow_html: yes
+bibliography: null-results.bib
 ---

+A null result is when a [hypothesis test](https://methods.egap.org/guides/analysis-procedures/hypothesis-testing_en.html) indicates that there is not enough evidence to say an intervention (treatment) changed outcomes in a study. Null results might occur because the intervention truly has no effect or because there is not enough information to detect an effect that exists.

 # A null result indicates that a study did not generate evidence to conclude that an intervention changed outcomes.

-There may not be evidence either because the intervention does not, in fact, change outcomes or because the study failed to gather adequate evidence of an effect that actually exists. Findings, including null results, are a function of both how the world works and the approaches used to learn about it.
-
-To learn more about hypothesis testing and see sample code, see [10 Things to Know About Hypothesis Testing](https://methods.egap.org/guides/analysis-procedures/hypothesis-testing_en.html).
+There may not be evidence either because the intervention does not, in fact, change outcomes or because the study failed to gather adequate evidence of an effect that actually exists. Null results, like all other findings, are a function of both how the world works and the research design and statistical methods used to learn about the world.
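+
+As a rough illustration, consider the following hypothetical simulation (made-up effect sizes and sample sizes, not drawn from any study in this guide). Both a true zero effect and a real but small effect typically return p > 0.05, and the p-value alone cannot tell the two situations apart:
+
+```python
+import numpy as np
+from scipy import stats
+
+# A hypothetical two-arm experiment with 50 units per arm.
+rng = np.random.default_rng(2024)
+control = rng.normal(0.0, 1.0, 50)
+
+# Case 1: the intervention truly has no effect.
+treated_zero = rng.normal(0.0, 1.0, 50)
+print(stats.ttest_ind(treated_zero, control).pvalue)
+
+# Case 2: a real but small effect (0.1 SD) that 100 total units
+# cannot reliably detect. Both p-values will usually exceed 0.05.
+treated_small = rng.normal(0.1, 1.0, 50)
+print(stats.ttest_ind(treated_small, control).pvalue)
+```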

-# Well-designed studies with null effects are important contributions to expanding knowledge and understanding of the world.
+# Well-designed studies with null effects are important contributions to knowledge about the world.

-Researchers sometimes think of null results as neither interesting nor useful. However, when studies with null results are not published,[^1] researchers might waste limited time and resources conducting similar studies. The underrepresentation of null results also means that the published literature tends to overestimate true effect sizes. It is therefore important for researchers to write up and make publicly available results from each experiment they conduct.
+Researchers sometimes think of null results as neither interesting nor useful, but it is important to make publicly available results from all experiments. Studies with null results are underrepresented in published research.[^1] When studies with null results are not disseminated, other researchers might conduct similar studies thinking they are exploring new ground instead of directing their time and resources in more fruitful directions. The underrepresentation of null results also means that the impression of effect sizes given by the published literature is too large.

-[^1]: Currently, studies with null results are underrepresented in published research. To learn more about *publication bias* in the social sciences, see @franco_2014.
+[^1]: To learn more about *publication bias* in the social sciences, see @franco_2014.

-# Sometimes null results are an artifact of true zero effects: null results might reflect that an intervention in fact does not move outcomes.
+# Sometimes null results are an artifact of true zero effects:
+
+# a. Null results might reflect that an intervention in fact does not move outcomes.

 If the intervention does not work, is too weak, or if the outcomes of interest are resistant to change, an intervention may simply not affect outcomes. For example, [Metaketa I](https://egap.org/our-work/the-metaketa-initiative/round1-information-accountability/) theorized that providing citizens with information about incumbent performance would enhance political accountability. In Benin, one research team found that a light-touch information intervention did not change voter behavior [@adida_2020].

-# Sometimes null results are an artifact of true zero effects: null results might not generalize: the intervention truly does not work in the study context but it may work in others
+# b. However, the intervention may work in other contexts: the null result might not generalize.

-Context matters! The same study carried out in two different places or at two different times can generate different results. Sometimes an intervention that is ineffective in one setting will work in other contexts. Because of this, researchers should not conclude an intervention can never change outcomes on the basis of one study.
-
-One illustration of this scenario is a series of studies in Africa testing whether community-based monitoring of health services improve healthcare uptake and outcomes. One initial study in Uganda found promising results [@bjorkman_2009]. Ten years later, two additional teams replicated the intervention in Sierra Leone [@christensen_2021] and [Uganda](https://egap.org/resource/does-bottom-up-accountability-work-evidence-from-uganda/). The Sierra Leonean study also found promising results. In contrast, the Ugandan replication study had largely null findings.
+The same study carried out in two different places or at two different times can generate different results. Sometimes an intervention that is ineffective in one setting will work in other contexts. Because of this, researchers should not conclude an intervention can never change outcomes on the basis of one study. For example, several studies in Africa tested whether community-based monitoring of health services improves healthcare uptake and outcomes. @bjorkman_2009 initially found promising results in Uganda. Ten years later, a Sierra Leonean study also found promising results [@christensen_2021], but a [Ugandan replication study](https://egap.org/resource/does-bottom-up-accountability-work-evidence-from-uganda/) had largely null findings.

-# Sometimes null results are an artifact of true zero effects: null effects might reflect opposite reactions to the intervention by different units in the study.
+# c. Null effects might result from some units responding positively and other units negatively to the intervention.

-Studies often focus on average treatment effects. However, the average treatment effect might mask important variation in effects across units within the study. Positive effects among some units may cancel out negative effects among other units, producing an average treatment effect indistinguishable from zero. For example, some respondents in an experiment in the United States felt more warmly toward the candidate after learning about a political candidate's partisanship, while others felt more cold toward the candidate. The direction of the effect depended on the respondent's own partisanship [@lelkes_2021]. Studies that fail to take into account how the direction of the effect depends on unit characteristics may generate null effects overall, even though the treatment shifted outcomes for many units. Even a larger sample size would have produced a null average treatment effect.
-
-To learn more about heterogeneous treatment effects and see sample code, see [10 Things to Know about Heterogeneous Treatment Effects](https://methods.egap.org/guides/research-questions/heterogeneous-effects_en.html).
+Randomized experiments generally focus on average treatment effects. However, the average treatment effect might mask important variation in effects across units within the study. Positive effects among some units may cancel out negative effects among other units, producing an average treatment effect indistinguishable from zero. For example, some respondents in an experiment in the United States felt warmer toward a political candidate after learning about the candidate's partisanship, while others felt colder toward the candidate. The direction of the effect depended on the respondent's own partisanship [@lelkes_2021]. Studies that fail to take into account how the direction of the effect depends on characteristics of individual subjects may generate null effects overall, even though the treatment shifted outcomes for many units. Even a larger sample size would have produced a null average treatment effect. To learn more about heterogeneous treatment effects, see [10 Things to Know about Heterogeneous Treatment Effects](https://methods.egap.org/guides/research-questions/heterogeneous-effects_en.html).
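+
+As a stylized illustration (simulated data; subgroup effects of +1 and -1 standard deviations chosen for clarity, not taken from the study above), the sketch below shows an overall estimate near zero alongside large, detectable effects within each subgroup:
+
+```python
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(7)
+n = 1000
+copartisan = rng.random(n) < 0.5          # two equally likely subgroups
+treat = rng.random(n) < 0.5               # random assignment
+effect = np.where(copartisan, 1.0, -1.0)  # +1 SD in one group, -1 SD in the other
+y = rng.normal(0.0, 1.0, n) + treat * effect
+
+# The overall ATE is typically indistinguishable from zero, despite n = 1000.
+print(stats.ttest_ind(y[treat], y[~treat]).pvalue)
+
+# Within each subgroup, the effect is large and easily detected.
+for group in (True, False):
+    m = copartisan == group
+    print(stats.ttest_ind(y[m & treat], y[m & ~treat]).pvalue)
+```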

-# Sometimes null results are an artifact of research design: null results might reflect an underpowered research design.
+# Sometimes null results are an artifact of research design: Null results might reflect an underpowered research design.

-A research design is underpowered when design features undermine the ability to reliably detect a true effect. Inadequate power is a ubiquitous problem in social science research. In a recent working paper reviewing 16,000 hypothesis tests from 2,000 political science articles, @arel-bundock_2023 found that the median study has only 10% power. In other words, studies of an intervention with a true effect will generate null results nine out of ten times simply because of insufficient sample size. Only 10% of the studies in the review were powered at 80%, a commonly-used threshold for minimum levels of power. Several features of research design may contribute to lack of statistical power. To learn more about statistical power and see sample code, see [10 Things to Know about Statistical Power](https://methods.egap.org/guides/assessing-designs/power_en.html).
+A research design is underpowered when design features undermine the ability to reliably detect a true effect. Inadequate power is a ubiquitous problem in social science research. In a recent working paper reviewing 16,000 hypothesis tests from 2,000 political science articles, @arel-bundock_2023 found that even with generous assumptions the median study had only 10% power. In other words, studies of an intervention with a true effect will generate null results nine out of ten times simply because of insufficient sample size. Only 10% of the studies in the review were powered at 80%, a commonly used threshold for adequate power.

-First, research designs with imprecise measurement strategies are more likely to yield null results. Measurement strategies may not be sensitive enough to capture the changes that occurred. For example, consider a study examining whether exercise improves cardiovascular health. In the short term, it will be easier to detect improvements in cardiovascular health through more finer measures like resting heart rate compared to more blunt measures like whether the individual died. To learn more about indices and see sample code, see [10 Things to Know About Indices](https://methods.egap.org/guides/data-strategies/indices_en.html). To learn more about measurement generally, see [10 Things to Know About Measurement in Experiments](https://methods.egap.org/guides/data-strategies/measurement_en.html).
+Features of research design may contribute to the lack of statistical power. First, measurement strategies may not be sensitive enough to capture the changes that occurred. For example, consider whether exercise improves cardiovascular health within 6 months. It will be easier to detect improvements in cardiovascular health through finer measures like resting heart rate than through blunter measures like whether the individual died. To learn more about measurement, see [10 Things to Know About Measurement in Experiments](https://methods.egap.org/guides/data-strategies/measurement_en.html).

-Second, research designs with small samples can produce null results even with sensitive measurement strategies. More units are needed to distinguish between a small average effect and a true zero effect than between a large average effect and a true zero effect. An effect may be small on average if an intervention produces small effects among many treated units or if it produces larger effects among only a few treated units.
+Second, research designs with small samples can produce null results even with sensitive measurement strategies. More units are needed to distinguish between a small average effect and a true zero effect than between a large average effect and a true zero effect. To learn more about statistical power, see [10 Things to Know about Statistical Power](https://methods.egap.org/guides/assessing-designs/power_en.html).
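+
+How severe is the problem? A minimal simulation sketch (hypothetical parameters: a 0.2 SD effect and a 5% significance level) estimates power by repeatedly drawing data and counting rejections:
+
+```python
+import numpy as np
+from scipy import stats
+
+def power(n_per_arm, effect_sd, sims=2000, seed=0):
+    """Share of simulated two-arm experiments that reject the null at 5%."""
+    rng = np.random.default_rng(seed)
+    rejections = 0
+    for _ in range(sims):
+        control = rng.normal(0.0, 1.0, n_per_arm)
+        treated = rng.normal(effect_sd, 1.0, n_per_arm)
+        rejections += stats.ttest_ind(treated, control).pvalue < 0.05
+    return rejections / sims
+
+# A 0.2 SD effect is hard to detect with small samples:
+print(power(50, 0.2))   # roughly 0.17: most such studies return nulls
+print(power(400, 0.2))  # roughly 0.80: near the conventional threshold
+```
+
+With effects of this size, several hundred units per arm is roughly the entry price for reliable detection.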

-# Sometimes null results because designs interact with the real world in unexpected ways: null results might reflect because treatments or their effects spill over from units in one experimental condition to another.
+# Sometimes null results arise because designs interact with the real world in unexpected ways:
+
+# a. Null results might result from treatments or their effects spilling over from units in one experimental condition to another.

-The intervention or its effects may spill over to units that were not assigned to receive that intervention. For example, if an intervention provides cash transfers to treated units, treated units may share the cash with other units not assigned to that intervention. If an intervention reduces racial prejudice, social connections among units may create shifts in racial attitudes and norms among units assigned to control as well as those assigned to treatment. Although the intervention produces real changes in these situations, the spillovers from the intervention targets to other units makes the changes difficult to detect when comparing study units to one another.
-
-Some research designs anticipate and measure spillovers. To learn more about spillovers, see [10 Things to Know About Spillovers](https://methods.egap.org/guides/data-strategies/spillovers_en.html).
+The intervention or its effects may spill over to units that were not assigned to receive that intervention. For example, if an intervention provides cash transfers to treated units, treated units may share the cash with other units not assigned to that intervention. If an intervention reduces racial prejudice, social connections among units may create shifts in racial attitudes and norms among units assigned to control as well as those assigned to treatment. Although the intervention produces real changes in these situations, the spillovers from the intervention targets to other units make the changes difficult to detect when comparing units in one experimental condition to units in another. Some research designs anticipate and measure spillovers. To learn more about spillovers, see [10 Things to Know About Spillovers](https://methods.egap.org/guides/data-strategies/spillovers_en.html).
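+
+A stylized sketch of the attenuation (made-up numbers: a 0.5 SD true effect, with 60% of control units receiving half of that effect through social ties to treated units):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(3)
+n = 2000
+treat = rng.random(n) < 0.5
+true_effect = 0.5
+
+# Suppose 60% of control units are connected to a treated unit and
+# receive half of the treatment's effect through spillover.
+spill = ~treat & (rng.random(n) < 0.6)
+y = rng.normal(0.0, 1.0, n) + treat * true_effect + spill * (0.5 * true_effect)
+
+naive = y[treat].mean() - y[~treat].mean()
+print(naive)  # roughly 0.35, well below the true effect of 0.5
+```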

-# Sometimes null results because designs interact with the real world in unexpected ways: null results might reflect incomplete intervention implementation.
+# b. Null results might reflect incomplete implementation of the intervention.

-Designated control units may inadvertently receive treatment or units assigned to treatment might not receive it.[^2] For example, one [study](https://www.gsb.stanford.edu/insights/everything-can-go-wrong-field-experiment-what-do-about-it) aimed to examine the effects of conditional cash transfer programs to improve school attendance. In the control condition, participants should have received unconditional cash transfers regardless of school attendance. However, the government implementing the transfer programs required families to enroll children in school in order to receive the "unconditional" funds, thus making the control group similar to the treatment group. This dynamic can bias results toward null findings even if there is actually an effect.
-
-[^2]: To learn more about *non-compliance* (when whether a unit received treatment status does not match whether it was assigned to receive treatment), see [How to design and implement an experiment (life cycle)](https://methods.egap.org/guides/data-strategies/how-to_en.html).
+Units randomly assigned to control may inadvertently receive treatment, or units assigned to treatment might not receive it. For example, one [study](https://www.gsb.stanford.edu/insights/everything-can-go-wrong-field-experiment-what-do-about-it) aimed to examine the effects of conditional cash transfer programs to improve school attendance. In the control condition, participants should have received unconditional cash transfers regardless of school attendance. However, the government implementing the transfer programs required families to enroll children in school in order to receive the "unconditional" funds, thus making the control group similar to the treatment group. This can bias results toward null findings even if there is actually an effect.
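+
+A stylized sketch of how such crossover mutes the estimate (made-up compliance rates, not estimates from that study): if most control units effectively receive the treatment, the assigned-treatment versus assigned-control contrast shrinks toward zero.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(11)
+n = 2000
+assigned = rng.random(n) < 0.5  # assigned to treatment
+
+# Implementation slippage: 80% of control units end up treated anyway.
+received = assigned | (rng.random(n) < 0.8)
+true_effect = 0.4
+y = rng.normal(0.0, 1.0, n) + received * true_effect
+
+itt = y[assigned].mean() - y[~assigned].mean()
+print(itt)  # roughly 0.4 * (1.0 - 0.8) = 0.08, far below the true 0.4
+```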

-# Sometimes null results because designs interact with the real world in unexpected ways: null results might reflect differential attrition.
+# c. Null results might reflect differential attrition.

-An intervention or its effects may cause treated units to drop out of the study. If units where the treatment was effective are less likely to complete the study, then the study will likely underestimate the treatment's impacts. Differential attrition may then explain null results.
-
-To learn more about attrition and see sample code, see [10 Things to Know About Missing Data](https://methods.egap.org/guides/data-strategies/missing-data_en.html).
+An intervention or its effects may cause treated units to drop out of the study. If units where the treatment was effective are less likely to complete the study, such as when a successful training program leads a subject to move outside the study area for work, then the study will likely underestimate the treatment's impacts. Differential attrition may then explain null results. To learn more about attrition and see sample code, see [10 Things to Know About Missing Data](https://methods.egap.org/guides/data-strategies/missing-data_en.html).

-# Understanding why an intervention did not work is hard.
+# Understanding why an intervention did not work is difficult.

-Understanding why an intervention didn't work as expected can be harder than understanding why it did. Researchers design studies to confirm theories about how the world works. If a study generates null results, researchers must again engage in theory building to explain their findings. Talking to implementing partners and study participants or conducting additional statistical analyses (like balance tests or tests for heterogeneous treatment effects) might help researchers identify or rule out some explanations. After this process, researchers can design additional studies to test new explanations and theories.
+Understanding why an intervention didn't work as expected can be harder than understanding why it did. Researchers design studies based on their model of how the world works. If a study generates null results, researchers should first investigate whether this may be due to failures in research design, as @humphreys_exporting_2019 do. Researchers can then engage in theory-building to explain their findings. Talking to implementing partners and study participants or conducting additional statistical analyses like balance tests or tests for heterogeneous treatment effects might help researchers identify or rule out some explanations. After this process, researchers can design additional studies to test new explanations and theories.

 # References {.unnumbered .unlisted}