diff --git a/guides/interpretation/null-results.bib b/guides/interpretation/null-results.bib index cf9cc67..dbf8426 100644 --- a/guides/interpretation/null-results.bib +++ b/guides/interpretation/null-results.bib @@ -1,42 +1,110 @@ -%% This BibTeX bibliography file was created using BibDesk. -%% https://bibdesk.sourceforge.io/ +@article{adida_2020, + title = {When {{Does Information Influence Voters}}? {{The Joint Importance}} of {{Salience}} and {{Coordination}}}, + shorttitle = {When {{Does Information Influence Voters}}?}, + author = {Adida, Claire and Gottlieb, Jessica and Kramon, Eric and McClendon, Gwyneth}, + date = {2020-05-01}, + journaltitle = {Comparative Political Studies}, + volume = {53}, + number = {6}, + pages = {851--891}, + publisher = {{SAGE Publications Inc}}, + issn = {0010-4140}, + doi = {10.1177/0010414019879945}, + url = {https://doi.org/10.1177/0010414019879945}, + urldate = {2023-12-22}, + abstract = {Scholars argue that access to information about a politician’s programmatic performance helps voters reward good performers and punish poor ones. But in places where resources are made conditional on collective electoral behavior, voters may not want to defect to vote for a strong legislative performer if they do not believe that others will. We argue that two conditions must hold for information about politician performance to affect voter behavior: Voters must care about the information and believe that others in their constituency care as well. In a field experiment around legislative elections in Benin, voters rewarded good programmatic performance only when information was both made relevant to voters and widely disseminated within the electoral district. Otherwise, access to positive legislative performance information actually lowered vote share for the incumbent’s party. These results demonstrate the joint importance of Salience and voter coordination in shaping information’s impact in clientelistic democracies.}, + langid = {english} +} +@article{humphreys_exporting_2019, + title = {Exporting Democratic Practices: {{Evidence}} from a Village Governance Intervention in {{Eastern Congo}}}, + author = {Humphreys, Macartan and {de la Sierra}, Ra{\'u}l S{\'a}nchez and {van der Windt}, Peter}, + year = {2019}, + journal = {Journal of Development Economics}, + volume = {140}, + pages = {279--301}, + issn = {0304-3878}, + doi = {10.1016/j.jdeveco.2019.03.011} + } +@article{arel-bundock_2023, + title = {Quantitative {{Political Science Research}} Is {{Greatly Underpowered}}}, + author = {Arel-Bundock, Vincent and Briggs, Ryan and Doucouliagos, Hristos and Aviña, Marco M. and Stanley, T. D.}, + date = {2023-12-22}, + publisher = {{OSF}}, + doi = {10.31219/osf.io/7vy2f}, + url = {https://osf.io/7vy2f}, + urldate = {2023-12-22}, + abstract = {We analyze the statistical power of political science research by collating over 16,000 hypothesis tests from about 2,000 articles. Even with generous assumptions, the median analysis has about 10\% power, and only about 1 in 10 tests have at least 80\% power to detect the consensus effects reported in the literature. There is also substantial heterogeneity in tests across research areas, with some being characterized by high-power but most having very low power. To contextualize our findings, we survey political methodologists to assess their expectations about power levels. 
Most methodologists greatly overestimate the statistical power of political science research.}, + langid = {american}, + file = {C\:\\Users\\jham9\\Zotero\\storage\\Y6VJE9ZC\\Arel-Bundock et al. - 2023 - Quantitative Political Science Research is Greatly.pdf;C\:\\Users\\jham9\\Zotero\\storage\\FPG97PEI\\7vy2f.html} +} -%% Created for Matt Lisiecki at 2023-02-09 09:48:32 -0500 +@article{bjorkman_2009, + title = {Power to the {{People}}: {{Evidence}} from a {{Randomized Field Experiment}} on {{Community-Based Monitoring}} in {{Uganda}}}, + shorttitle = {Power to the {{People}}}, + author = {Björkman, Martina and Svensson, Jakob}, + date = {2009}, + journaltitle = {The Quarterly Journal of Economics}, + volume = {124}, + number = {2}, + eprint = {40506242}, + eprinttype = {jstor}, + pages = {735--769}, + issn = {0033-5533}, + url = {http://www.jstor.org/stable/40506242}, + urldate = {2018-03-07}, + abstract = {This paper presents a randomized field experiment on community-based monitoring of public primary health care providers in Uganda. Through two rounds of village meetings, localized nongovernmental organizations encouraged communities to be more involved with the state of health service provision and strengthened their capacity to hold their local health providers to account for performance. A year after the intervention, treatment communities are more involved in monitoring the provider, and the health workers appear to exert higher effort to serve the community. We document large increases in utilization and improved health outcomes—reduced child mortality and increased child weight—that compare favorably to some of the more successful community-based intervention trials reported in the medical literature.} +} +@article{christensen_2021, + title = {Building {{Resilient Health Systems}}: {{Experimental Evidence}} from {{Sierra Leone}} and {{The}} 2014 {{Ebola Outbreak}}*}, + shorttitle = {Building {{Resilient Health Systems}}}, + author = {Christensen, Darin and Dube, Oeindrila and Haushofer, Johannes and Siddiqi, Bilal and Voors, Maarten}, + date = {2021-05-01}, + journaltitle = {The Quarterly Journal of Economics}, + shortjournal = {The Quarterly Journal of Economics}, + volume = {136}, + number = {2}, + pages = {1145--1198}, + issn = {0033-5533}, + doi = {10.1093/qje/qjaa039}, + url = {https://doi.org/10.1093/qje/qjaa039}, + urldate = {2023-12-22}, + abstract = {Skepticism about the quality of health systems and their consequent underuse are thought to contribute to high rates of mortality in the developing world. The perceived quality of health services may be especially critical during epidemics, when people choose whether to cooperate with response efforts and frontline health workers. Can improving the perceived quality of health care promote community health and ultimately help to contain epidemics? We leverage a field experiment in Sierra Leone to answer this question in the context of the 2014 West African Ebola crisis. Two years before the outbreak, we randomly assigned two interventions to government-run health clinics—one focused on community monitoring, and the other conferred nonfinancial awards to clinic staff. Prior to the Ebola crisis, both interventions increased clinic utilization and patient satisfaction. Community monitoring additionally improved child health, leading to 38\% fewer deaths of children under age five. 
Later, during the crisis, the interventions also increased reporting of Ebola cases by 62\%, and community monitoring significantly reduced Ebola-related deaths. Evidence on mechanisms suggests that both interventions improved the perceived quality of health care, encouraging patients to report Ebola symptoms and receive medical care. Improvements in health outcomes under community monitoring suggest that these changes partly reflect a rise in the underlying quality of administered care. Overall, our results indicate that promoting accountability not only has the power to improve health systems during normal times, but can also make them more resilient to emergent crises.}, + file = {C\:\\Users\\jham9\\Zotero\\storage\\E5WNJFQX\\Christensen et al. - 2021 - Building Resilient Health Systems Experimental Ev.pdf;C\:\\Users\\jham9\\Zotero\\storage\\K8LDF37S\\5996193.html} +} -%% Saved with string encoding Unicode (UTF-8) +@article{franco_2014, + title = {Publication Bias in the Social Sciences: {{Unlocking}} the File Drawer}, + shorttitle = {Publication Bias in the Social Sciences}, + author = {Franco, Annie and Malhotra, Neil and Simonovits, Gabor}, + date = {2014-09-19}, + journaltitle = {Science}, + volume = {345}, + number = {6203}, + pages = {1502--1505}, + publisher = {{American Association for the Advancement of Science}}, + doi = {10.1126/science.1255484}, + url = {https://www.science.org/doi/full/10.1126/science.1255484}, + urldate = {2023-12-22}, + abstract = {We studied publication bias in the social sciences by analyzing a known population of conducted studies—221 in total—in which there is a full accounting of what is published and unpublished. We leveraged Time-sharing Experiments in the Social Sciences (TESS), a National Science Foundation–sponsored program in which researchers propose survey-based experiments to be run on representative samples of American adults. Because TESS proposals undergo rigorous peer review, the studies in the sample all exceed a substantial quality threshold. Strong results are 40 percentage points more likely to be published than are null results and 60 percentage points more likely to be written up. We provide direct evidence of publication bias and identify the stage of research production at which publication bias occurs: Authors do not write up and submit null findings.}, + file = {C:\Users\jham9\Zotero\storage\74ZBSQ5P\Franco et al. - 2014 - Publication bias in the social sciences Unlocking.pdf} +} - - -@article{bhatti_et_al_2016, - author = {Bhatti, Yosef and Dahlgaard, Jens Olav and Hansen, Jonas Hedegaard and Hansen, Kasper M.}, - date-added = {2023-02-09 09:46:29 -0500}, - date-modified = {2023-02-09 09:47:28 -0500}, - journal = {British Journal of Political Science}, - number = {1}, - pages = {279-290}, - title = {Is Door-to-Door Canvassing Effective in Europe? Evidence from a Meta-study across Six European Countries}, - volume = {49}, - year = {2016}} - -@article{bhatti_et_al_2018, - author = {Bhatti, Yosef and Dahlgaard, Jens Olav and Hansen, Jonas Hedegaard and Hansen, Kasper M.}, - date-added = {2023-02-09 09:42:58 -0500}, - date-modified = {2023-02-09 09:45:43 -0500}, - journal = {West European Politics}, - number = {1}, - pages = {240-260}, - title = { Can governments use Get Out The Vote letters to solve Europe's turnout crisis? 
Evidence from a field experiment}, - volume = {41}, - year = {2018}} - -@article{abadie_2020, - author = {Abadie, Alberto}, - date-added = {2023-02-09 09:41:33 -0500}, - date-modified = {2023-02-09 09:42:06 -0500}, - journal = {American Economic Review: Insights}, - number = {2}, - pages = {193-208}, - title = {Statistical Nonsignificance in Empirical Economics}, - volume = {2}, - year = {2020}} +@article{lelkes_2021, + title = {Policy over Party: Comparing the Effects of Candidate Ideology and Party on Affective Polarization}, + shorttitle = {Policy over Party}, + author = {Lelkes, Yphtach}, + date = {2021-01}, + journaltitle = {Political Science Research and Methods}, + volume = {9}, + number = {1}, + pages = {189--196}, + publisher = {{Cambridge University Press}}, + issn = {2049-8470, 2049-8489}, + doi = {10.1017/psrm.2019.18}, + url = {https://www.cambridge.org/core/journals/political-science-research-and-methods/article/abs/policy-over-party-comparing-the-effects-of-candidate-ideology-and-party-on-affective-polarization/7CE28F0E9763297A765263B1F774B7A1}, + urldate = {2023-12-22}, + abstract = {At least two theories have been offered that explain the rise of affective polarization. Some scholars, relying on social identity theory, argue that as the relevance of party identification increased, Americans became more likely to see their in-party in positive terms and the out-party in negative terms. Other scholars argue that affective polarization is a reaction to increasingly extreme political actors. This study seeks to arbitrate between these two theories of affective polarization through a survey experiment which asks respondents to rate candidates whose party (or lack thereof) and ideology (or lack thereof) is randomly assigned. In line with the policy-oriented view of affective polarization, respondents reacted far more strongly to ideology than party, especially if it was the ideology of the member of the out-party.}, + langid = {english}, + keywords = {Public opinion} +} diff --git a/guides/interpretation/null-results_en.qmd b/guides/interpretation/null-results_en.qmd index 3d0b3d6..0a5351a 100644 --- a/guides/interpretation/null-results_en.qmd +++ b/guides/interpretation/null-results_en.qmd @@ -1,99 +1,80 @@ --- -title: "10 Things Your Null Result Might Mean" +title: "10 Things Your Null Results Might Mean" author: - - name: "Rekha Balu" - url: https://cabs.mdrc.org/team/rekha-balu -bibliography: null-results.bib -image: null-results.png -abstract: | - After the excitement and hard work of running a field experiment is over, it’s not uncommon to hear policymakers and researchers express disappointment when they end up hearing that the intervention did not have a detectable impact. - This guide explains that a null result rarely means “the intervention didn’t work,” even though that tends to be the shorthand many people use. Instead, a null result can reflect the myriad design choices that policy implementers and researchers make in the course of developing and testing an intervention. After all, people tend to label [hypothesis tests](https://methods.egap.org/guides/analysis-procedures/hypothesis-testing_en.html) with high p-values as “null results”, and hypothesis tests (as summaries of information about design and data) can produce large p-values for many reasons. Policymakers can make better decisions about what to do with a null result when they understand how and why they got that result. + - name: "Jennifer A. 
Hamilton" + url: https://sites.google.com/view/jennifer-a-hamilton/ +bibliography: null-results.bib --- -Imagine you lead the department of education for a government and are wondering about how to boost student attendance. You decide to consider a text message intervention that offers individual students counseling. Counselors at each school can help students address challenges specifically related to school attendance. Your team runs a randomized trial of the intervention, and tells you there is a null result. +A null result is when a [hypothesis test](https://methods.egap.org/guides/analysis-procedures/hypothesis-testing_en.html) indicates that there is not enough evidence to say an intervention (treatment) changed outcomes in a study. Null results might occur because the intervention truly has no effect or because there is not enough information to detect an effect that exists. -How should you understand the null result, and what should you do about it? It could be a result of unmet challenges at several stages of your work -- in the way the intervention is designed, the way the intervention is implemented, or the way study is designed Below are 10 things to consider when interpreting your null result. - -Intervention Design -== +# A null result indicates that a study did not generate evidence to conclude that an intervention changed outcomes. -## 1. Your intervention theory and approach are mismatched to the problem. -You delivered a counseling intervention because you thought that students needed support to address challenges in their home life. However, students who had the greatest needs never actually met with a counselor, in part because they did not trust adults at the school. The theory of change assumed that absenteeism was a function primarily of a student’s personal decisions or family circumstances and that the offer of counseling without changes to school climate would be sufficient; it did not account appropriately for low levels of trust in teacher-student relationships. Therefore, this null effect does not suggest that counseling per se cannot boost attendance, but that counseling in the absence of other structural or policy changes or in the context of low-trust schools may not be sufficient. +There may not be evidence either because the intervention does not, in fact, change outcomes or because the study failed to gather adequate evidence of an effect that actually exists. Null results, like all other findings, are a function of both how the world works and the research design and statistical methods used to learn about the world. + -***How can you tell if***...you have a mismatch between your theory of change and the problem that needs to be solved? List all potential barriers and consider how they connect. Does the intervention as designed address only one of those barriers, and, if so, can it succeed without addressing others? Are there assumptions made about one source or one cause that may undermine the success of the intervention? +# Well-designed studies with null effects are important contributions to knowledge about the world. -## 2. Your intervention strength or dosage is too low for the problem or outcome of interest. -After talking to experts, you learn that counseling interventions can build trust, but usually require meetings that are more frequent and regular than your intervention offered to have the potential for an effect. Maybe your “dose” of services is too small. 
+Researchers sometimes think of null results as neither interesting nor useful, but it is important to make the results of all experiments publicly available. Studies with null results are underrepresented in published research.[^1] When studies with null results are not disseminated, other researchers might conduct similar studies thinking they are exploring new ground instead of directing their time and resources in more fruitful directions. The under-representation of null results also means that the impression of effect sizes given by the published literature is inflated. -***How can you tell if***...you did not have a sufficient “dose”? Even if no existing services tackle your problem of interest, consider what is a minimum level, strength, or dose that is both feasible to implement and could yield an effect. When asking sites what they are willing to take on, beware of defaulting to the lowest dose. The more complex the problem or outcome is to move, the stronger or more comprehensive the intervention may need to be. +[^1]: To learn more about *publication bias* in the social sciences, see @franco_2014. -## 3. Your intervention does not represent a large enough enhancement over usual services. -In your position at the state department of education, you learn that students at the target schools were already receiving some counseling and support services. Even though the existing services were not sufficient to boost attendance to the targeted levels, the new intervention did not add enough content or frequency of the counseling services to reach those levels either---the intervention yielded show-up rates that were about the same as existing services. So this null effect does not reflect that counseling has no effect, but rather that the version of counseling your intervention offered was not effective over and above existing counseling services. +# Sometimes null results are an artifact of true zero effects: -***How can you tell if***...the relative strength of your intervention was not sufficient to yield an effect? Take stock of the structure and content of existing services, and consider if the extent or form in which clients respond to existing services indicates that the theory of change or approach needs to be revised. If the theory holds, use existing services as a benchmark and consider whether your proposed intervention needs to include something supplementary and/or something complementary. - -Intervention Implementation -== -Programs rarely rollout exactly as intended, but some variations are more problematic than others. # a. Null results might reflect that an intervention in fact does not move outcomes. -## 4. Your implementation format was not reliable. -In the schools in your study, counseling interventions sometimes occurred in person, sometimes happened by text message, sometimes by phone. Anticipating and allowing for some variation and adaptation is important. Intervention dosage and strength is often not delivered as designed nor to as many people as expected. +If an intervention is poorly designed, too weak, or aimed at outcomes that are resistant to change, it may simply not affect outcomes. For example, [Metaketa I](https://egap.org/our-work/the-metaketa-initiative/round1-information-accountability/) theorized that providing citizens with information about incumbent performance would enhance political accountability. In Benin, one research team found that a light-touch information intervention did not change voter behavior [@adida_2020].
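+
+A minimal Python simulation sketch can make this concrete. Everything below is illustrative and is not drawn from the studies cited in this guide: the outcome scale, the sample size, and the use of a simple two-arm design analyzed with a difference-in-means t-test are assumptions chosen only to show how a truly zero effect surfaces as a null result.
+
+```python
+# Hypothetical two-arm experiment in which the treatment truly has no effect.
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(42)
+n = 500                                             # units per arm (assumed)
+control = rng.normal(loc=50, scale=10, size=n)      # outcomes under control
+treated = rng.normal(loc=50, scale=10, size=n)      # same distribution: the true effect is zero
+
+estimate = treated.mean() - control.mean()
+p_value = stats.ttest_ind(treated, control).pvalue
+print(f"estimated effect: {estimate:.2f}, p-value: {p_value:.2f}")
+# A large p-value here reflects a genuinely zero effect, but the points that follow
+# show that the same kind of null can also arise for reasons unrelated to the true effect.
+```
+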
-But unplanned variations in format can reflect a host of selection bias issues, such that you cannot disentangle whether counseling as a concept does not work or whether certain formats of outreach did not work. This is especially important to guard against if you intend to test specific channels or mechanisms critical to your theory of change. + +# b. However, the intervention may work in other contexts - the null result might not generalize. -***How can you tell if***...an unreliable format is the reason for your null? Were you able to specify or standardize formats in a checklist? Could you leave enough discretion but still incentivize fidelity? Pre-specifying what the intervention should look like can help staff and researchers monitor along the way and correct inconsistencies or deviations that may affect the results. This could include a training protocol for those implementing the intervention. If nothing was specified or no one was trained, then the lack of consistency may be part of the explanation. +The same study carried out in two different places or at two different times can generate different results. Sometimes an intervention that is ineffective in one setting will work in other contexts. Because of this, researchers should not conclude an intervention can never change outcomes on the basis of one study. For example, several studies in Africa tested whether community-based monitoring of health services improve healthcare uptake and outcomes. @bjorkman_2009 initially found promising results in Uganda. Ten years later, a Sierra Leonean study also found promising results [@christensen_2021] , but a [Ugandan replication study](https://egap.org/resource/does-bottom-up-accountability-work-evidence-from-uganda/) had largely null findings. -## 5. Your intervention and outcome measure are mismatched to your randomization design. -You expected counseling to be more effective in schools with higher student-to-teacher ratios, but did not block randomize by class size (for more on block randomization, see our guide on [10 Things to Know About Randomization](https://egap.org/resource/10-things-to-know-about-randomization/)). then it may no longer have the potential to be more effective for students in high class size schools. + +# c. Null effects might result from some units reponding positively and other units negatively to the intervention. + -***How can you tell if***...you have a mismatch between your intervention and randomization design? Consider whether treatment effects could vary, or service delivery might occur in a [cluster](https://methods.egap.org/guides/data-strategies/cluster-randomization_en.html), or intervention concepts could [spill over](https://methods.egap.org/guides/data-strategies/spillovers_en.html), and to what extent your randomization design accounted for that. +Randomized experiments generally focus on average treatment effects. However, the average treatment effect might mask important variation in effects across units within the study. Positive effects among some units may cancel out negative effects among other units, producing an average treatment effect indistinguishable from zero. For example, some respondents in an experiment in the United States felt more warmly toward the candidate after learning about a political candidate's partisanship, while others felt more cold toward the candidate. The direction of the effect depended on the respondent's own partisanship [@lelkes_2021]. 
Studies that fail to take into account how the direction of the effect depends on characteristics of individual subjects may generate null effects overall, even though the treatment shifted outcomes for many units. In such cases, even a larger sample size would still have produced a null finding for the average effect. To learn more about heterogeneous treatment effects, see [10 Things to Know about Heterogeneous Treatment Effects](https://methods.egap.org/guides/research-questions/heterogeneous-effects_en.html). -Study Design -== +# Sometimes null results are an artifact of research design: Null results might reflect an underpowered research design. -## 6. Your study sample includes people whose behavior could not be moved by the intervention. -You ask schools to randomize students into an intervention or control (business-as-usual) group. Some students in both your intervention and control groups will always attend school, while some students will rarely attend school, regardless of what interventions are or are not offered to them. Your intervention’s success depends on not just whether students actually receive the message and/or believe it, but also on whether it can shift behavior among such potential responders. +A research design is underpowered when design features undermine the ability to reliably detect a true effect. Inadequate power is a ubiquitous problem in social science research. In a recent working paper reviewing 16,000 hypothesis tests from 2,000 political science articles, @arel-bundock_2023 found that even with generous assumptions the median study had only 10% power. In other words, studies of an intervention with a true effect will generate null results nine times out of ten simply because the design cannot reliably detect it. Only 10% of the studies in the review were powered at 80%, a commonly used threshold for adequate power. -If the proportion of potential responders is too small, then it may be difficult to detect an effect. In addition, your intervention may need to be targeted and modified in some way to address the needs of potential responders. +Features of research design may contribute to the lack of statistical power. First, measurement strategies may not be sensitive enough to capture the changes that occurred. For example, consider a study of whether exercise improves cardiovascular health within six months. It will be easier to detect improvements in cardiovascular health through finer measures like resting heart rate than through coarser measures like whether the individual died; the simulation sketch below illustrates this point. To learn more about measurement, see [10 Things to Know About Measurement in Experiments](https://methods.egap.org/guides/data-strategies/measurement_en.html). + -***How can you tell if***...the proportion of potential responders may be too small? Take a look at the pre-intervention attendance rate. If it is extremely low, does that rate reflect low demand or structural barriers that may limit the potential for response? Is it so high that it tells us that most people who could respond have already done so (say, 85% or higher)? Even if there is a large proportion of hypothetical potential responders, is it lower when you consider existing barriers preventing students from using counseling services that your intervention is not addressing? - -## 7. Your measure is not validated or reliable: It varies too much and systematically across sites. -As a leader of the state’s department of education, you want to measure the effectiveness of your intervention using survey data on student attitudes related to attendance.
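+
+The Python sketch below illustrates the measurement point above. All numbers and variable names are assumptions made up for illustration (a true three-point drop in resting heart rate, 150 units per arm, and a crude binary indicator for crossing a risky threshold); it is not drawn from any study cited in this guide.
+
+```python
+# Hypothetical power comparison: the same true effect is easier to detect with a
+# sensitive continuous measure than with a coarse binary measure.
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(7)
+n, sims, alpha = 150, 2000, 0.05
+reject_fine = reject_coarse = 0
+
+for _ in range(sims):
+    control = rng.normal(72, 10, n)        # resting heart rate in the control group
+    treated = rng.normal(69, 10, n)        # true effect: a 3-point reduction
+    # Fine-grained outcome: compare means of the continuous measure.
+    p_fine = stats.ttest_ind(treated, control).pvalue
+    # Coarse outcome: only record whether heart rate exceeds a risky threshold.
+    p_coarse = stats.ttest_ind((treated > 90).astype(float),
+                               (control > 90).astype(float)).pvalue
+    reject_fine += p_fine < alpha
+    reject_coarse += p_coarse < alpha
+
+print(f"power with the continuous measure: {reject_fine / sims:.2f}")
+print(f"power with the coarse binary measure: {reject_coarse / sims:.2f}")
+```
+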
You learn that only some schools administer a new survey measuring student attitudes, and those with surveys changed the survey items so that there is not the same wording across surveys or schools. If you observe no statistically significant difference on a survey measure that is newly developed or used by only select schools, it may be difficult to know whether the intervention “has no effect” or whether the outcome is measuring something different in each school because of different wording. +Second, research designs with small samples can produce null results even with sensitive measurement strategies. More units are needed to distinguish between a small average effect and a true zero effect than between a large average effect and a true zero effect. To learn more about statistical power, see [10 Things to Know about Statistical Power](https://methods.egap.org/guides/assessing-designs/power_en.html). + -***How can you tell if***... outcome measurement is the problem? Check to see whether the outcome is (1) collected in the same way across your sites and (2) if it means the same thing to the participants as it means to you. In addition, check on any reporting bias and if your study participants or sites face any pressure from inside or outside of their organizations to report or answer in a particular way. - -## 8. Your outcome is not validated or reliable: It varies too little. -Given the problems with the survey measure, you then decide to use administrative data from student records to measure whether students show up on time to school. But it turns out that schools used a generic definition of “on time” such that almost every student looks like they arrive on time. An outcome that does not have enough variation in it to detect an effect between intervention and control groups can be especially limiting if your intervention potentially could have had different effects on different types of students, but the outcome measure used in the study lacks the precision to capture the effects on different subgroups. +# Sometimes null results arise because designs interact with the real world in unexpected ways: -***How can you tell if***...your null result arises from measures that are too coarse or subject to response biases? Pressures to report a certain kind of outcome faced by people at your sites could again yield this kind of problem with outcome measurement. So, it is again worth investigating the meaning of the outcomes as reported by the sites from the perspective of those doing the reporting. This problem differs from the kind of ceiling and floor effects discussed elsewhere in this guide; it arises more from the strategic calculations of those producing administrative data and less from the natural behavior of those students whose behavior you are trying to change. - -## 9. Your [statistical power](https://methods.egap.org/guides/assessing-designs/power_en.html) is insufficient to detect an effect for the intervention as implemented. -This may sound obvious to people with experience testing interventions at scale. But researchers and policymakers can fall into two traps: +# a. Null results might result from treatments or their effects spilling over from units in one experimental condition to another. -1. Thinking about statistical significance rather than what represents a meaningful and feasible effect. Although a study with an incredibly large sample size can detect small effects with precision, one does not want to trade precision for meaning.
Moreover, an intervention known to be weak during the intervention design is likely to be weaker when implemented, especially across multiple sites or months. So it may not be sufficient to simply enroll more subjects to study an intervention known to be weak (even though strong research design cannot compensate for a weak intervention in any easy or direct way); +The intervention or its effects may spill over to units that were not assigned to receive that intervention. For example, if an intervention provides cash transfers to treated units, treated units may share the cash with other units not assigned to that intervention. If an intervention reduces racial prejudice, social connections among units may create shifts in racial attitudes and norms among units assigned to control as well as those assigned to treatment. Although the intervention produces real changes in these situations, the spillovers from the intervention targets to other units make the changes difficult to detect when comparing units assigned to treatment with units assigned to control. Some research designs anticipate and measure spillovers. To learn more about spillovers, see [10 Things to Know About Spillovers](https://methods.egap.org/guides/data-strategies/spillovers_en.html). -2. Thinking that the only relevant test statistic for an experiment effect is a difference of means (even though we have long known that differences of means are valid but low-powered test statistics when outcomes do not neatly fit into a normal distribution). + -***How can you tell if***...your null result arises mostly from low statistical power? Recall that statistical power depends on (a) effect size or intervention strength, (b) variability in outcomes, (c) the number of independent observations (often well measured with sample size), and (d) the test statistic you use. The previous discussions pointed out ways to learn whether an intervention you thought might be strong was weak, or whether an outcome that you thought might be clear could turn out to be very noisy. +# b. Null results might reflect incomplete implementation of the intervention. -A formal power analysis could also tell you that, given the variability in your outcome and the size of your effect, you would have needed a larger sample size to detect this effect reliably. For example, if you had known about the variability in administration of the treatment or the variability in the outcome (let alone surprises with missing data) in advance, your pre-field power analysis would have told you to use a different sample size. +Units randomly assigned to control may inadvertently receive treatment or units assigned to treatment might not receive it. + +For example, one [study](https://www.gsb.stanford.edu/insights/everything-can-go-wrong-field-experiment-what-do-about-it) aimed to examine the effects of conditional cash transfers on school attendance. In the control condition, participants should have received unconditional cash transfers regardless of school attendance. However, the government implementing the transfer programs required families to enroll children in school in order to receive the "unconditional" funds, thus making the control group similar to the treatment group. This can bias results toward null findings even if there is actually an effect, as the simulation sketch below illustrates.
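+
+The Python sketch below shows, with made-up numbers, how this kind of contamination pushes estimates toward zero. The 80% contamination rate, the outcome scale, and the sample size are illustrative assumptions, not figures from the study linked above.
+
+```python
+# Hypothetical experiment with a real effect, where most control units end up treated anyway.
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(3)
+n_per_arm, true_effect = 400, 5.0
+
+assigned_treat = np.repeat([True, False], n_per_arm)                 # assignment to treatment vs. control
+contaminated = ~assigned_treat & (rng.random(2 * n_per_arm) < 0.8)   # 80% of controls get treated anyway
+receives_treat = assigned_treat | contaminated
+
+outcome = rng.normal(100, 20, 2 * n_per_arm) + true_effect * receives_treat
+
+gap = outcome[assigned_treat].mean() - outcome[~assigned_treat].mean()
+p_value = stats.ttest_ind(outcome[assigned_treat], outcome[~assigned_treat]).pvalue
+print(f"true effect of receiving treatment: {true_effect:.1f}")
+print(f"assigned-treatment vs. assigned-control gap: {gap:.1f}, p-value: {p_value:.2f}")
+# With heavy contamination the estimated gap shrinks toward zero and the test is
+# usually null, even though receiving the treatment genuinely raises outcomes.
+```
+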
-A different test statistic can also change a null result into a positive result if, say, the effect is large but it is not an effect that shifts means as much as moves people who are extreme, or has the effect of making moderate students extreme. A classic example of this problem occurs with outcomes that have very long tails -- such as those involving money, like annual earnings or auction spending. [A t-test might produce a p-value of .20 but a rank-based test might produce a p-value of < .01](https://oes.gsa.gov/projects/gsa-auctions/). The t-test is using evidence of a shift in averages (means) to reflect on the null hypothesis of no effects. The rank-based test is merely asking whether the treatment group outcomes tend to be bigger than (or smaller than) the control group outcomes (whether or not they differ in means). - -Not all nulls are the result of a flaw in design or implementation! -== + + + + +# c. Null results might reflect differential attrition. + +An intervention or its effects may cause treated units to drop out of the study. If units where the treatment was effective are less likely to complete the study, such as when a successful training program might lead a subject to move outside the study area for work, then the study will likely underestimate the treatment's impacts. Differential attrition may then explain null results. To learn more about attrition and see sample code, see [10 Things to Know About Missing Data](https://methods.egap.org/guides/data-strategies/missing-data_en.html). -## 10. Your null needs to be published. -If you addressed all the issues above related to intervention design, sample size and research design, and have a precisely estimated, statistically significant null result, it is time to publish. Your colleagues and other researchers [need to learn from this finding](https://oes.gsa.gov/assets/files/unexpected-results-2-pager.pdf), so do not keep it to yourself. +# Understanding why an intervention did not work is difficult. -When you have a precise null, you do not have a gap in evidence--you are generating evidence. +Understanding why an intervention didn't work as expected can be harder than understanding why it did. Researchers design studies based on their model of how the world works. If a study generates null results, researchers should first investigate whether this may be due to failures in research design, as @humphreys_exporting_2019 do. Researchers can then engage in theory-building to explain their findings. Talking to implementing partners and study participants or conducting additional statistical analyses like balance tests or tests for heterogeneous treatment effects might help researchers identify or rule out some explanations. After this process, researchers can design additional studies to test new explanations and theories. -***What can you do to convince editors and reviewers they should publish your null results?*** This guide should help you reason about your null results and thus explain their importance. If other studies on your topic exist, you can also contextualize your results; for example, follow some of the ideas from @abadie_2020. -For an example, see how @bhatti_et_al_2018--in their study of a Danish governmental voter turnout intervention--used previous work on face-to-face voter turnout (reported on as a meta-analysis in @bhatti_et_al_2016) to contextualize their own small effects. 
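+
+One of the diagnostics mentioned above, a covariate balance check, can be sketched in a few lines of Python. The covariates, sample size, and distributions below are hypothetical; in a real study you would substitute the pre-treatment variables you actually measured.
+
+```python
+# Hypothetical balance check: compare pre-treatment covariates across arms.
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(11)
+n = 300
+treat = np.repeat([1, 0], n)                          # assignment indicator
+covariates = {
+    "age": rng.normal(35, 10, 2 * n),
+    "baseline_outcome": rng.normal(50, 15, 2 * n),
+}
+
+for name, x in covariates.items():
+    diff = x[treat == 1].mean() - x[treat == 0].mean()
+    p = stats.ttest_ind(x[treat == 1], x[treat == 0]).pvalue
+    print(f"{name}: treatment-control difference = {diff:.2f}, p = {p:.2f}")
+# Large imbalances on pre-treatment covariates can point to problems with the
+# randomization or with how data were collected, and can help explain surprising nulls.
+```
+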
-If you are unable to find a publication willing to include a study with null results in their journal, you can still contribute to the evidence base on the policy area under examination by making your working paper, data, and/or analysis code publicly available. Many researchers choose to do so via their personal websites; in addition, there are repositories (such as the [Open Science Framework](https://osf.io/)) that provide a platform for researchers to share their in-progress and unpublished work. +# References {.unnumbered .unlisted} -# References \ No newline at end of file