Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ Deferred items from PR reviews that were not addressed before merge.
| ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails; fixing requires sparse least-squares alternatives) |
| EfficientDiD: API docs / tutorial page for new public estimator | `docs/` | #192 | Medium |
| Multi-absorb weighted demeaning needs iterative alternating projections for N > 1 absorbed FE with survey weights; unweighted multi-absorb also uses single-pass (pre-existing, exact only for balanced panels) | `estimators.py` | #218 | Medium |
| CallawaySantAnna per-cell ATT(g,t) SEs under survey designs use influence-function variance, not full design-based TSL with strata/PSU/FPC. Design effects enter at aggregation via the WIF and survey df. Full per-cell TSL would require constructing unit-level influence functions on the global index and passing them through `compute_survey_vcov()`. | `staggered.py` | — | Medium |
| EfficientDiD hausman_pretest() clustered covariance uses stale `n_cl` after filtering non-finite EIF rows — should recompute effective cluster count and remap indices after `row_finite` filtering | `efficient_did.py` | #230 | Medium |
| TripleDifference power: `generate_ddd_data` is a fixed 2×2×2 cross-sectional DGP — no multi-period or unbalanced-group support. Add a `generate_ddd_panel_data` for panel DDD power analysis. | `prep_dgp.py`, `power.py` | #208 | Low |
| ContinuousDiD event-study aggregation does not filter by `anticipation` — uses all (g,t) cells instead of anticipation-filtered subset; pre-existing in both survey and non-survey paths | `continuous_did.py` | #226 | Medium |
| Survey design resolution/collapse patterns are inconsistent across panel estimators — ContinuousDiD rebuilds unit-level design in SE code, EfficientDiD builds once in fit(), StackedDiD re-resolves on stacked data; extract shared helpers for panel-to-unit collapse, post-filter re-resolution, and metadata recomputation | `continuous_did.py`, `efficient_did.py`, `stacked_did.py` | #226 | Low |
Expand Down
280 changes: 234 additions & 46 deletions diff_diff/imputation.py

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions diff_diff/imputation_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ class ImputationDiDResults:
bootstrap_results: Optional[ImputationBootstrapResults] = field(default=None, repr=False)
# Internal: stores data needed for pretrend_test()
_estimator_ref: Optional[Any] = field(default=None, repr=False)
# Survey design metadata (SurveyMetadata instance from diff_diff.survey)
survey_metadata: Optional[Any] = field(default=None, repr=False)

def __repr__(self) -> str:
"""Concise string representation."""
Expand Down Expand Up @@ -182,6 +184,27 @@ def summary(self, alpha: Optional[float] = None) -> str:
"",
]

# Survey design info
if self.survey_metadata is not None:
sm = self.survey_metadata
lines.extend(
[
"-" * 85,
"Survey Design".center(85),
"-" * 85,
f"{'Weight type:':<30} {sm.weight_type:>10}",
]
)
if sm.n_strata is not None:
lines.append(f"{'Strata:':<30} {sm.n_strata:>10}")
if sm.n_psu is not None:
lines.append(f"{'PSU/Cluster:':<30} {sm.n_psu:>10}")
lines.append(f"{'Effective sample size:':<30} {sm.effective_n:>10.1f}")
lines.append(f"{'Design effect (DEFF):':<30} {sm.design_effect:>10.2f}")
if sm.df_survey is not None:
lines.append(f"{'Survey d.f.:':<30} {sm.df_survey:>10}")
lines.extend(["-" * 85, ""])

# Overall ATT
lines.extend(
[
Expand Down
56 changes: 27 additions & 29 deletions diff_diff/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,24 +390,18 @@ def _validate_weights(weights, weight_type, n):
"""Validate weights array and weight_type for solve_ols/LinearRegression."""
if weight_type not in _VALID_WEIGHT_TYPES:
raise ValueError(
f"weight_type must be one of {_VALID_WEIGHT_TYPES}, "
f"got '{weight_type}'"
f"weight_type must be one of {_VALID_WEIGHT_TYPES}, " f"got '{weight_type}'"
)
if weights is not None:
weights = np.asarray(weights, dtype=np.float64)
if weights.shape[0] != n:
raise ValueError(
f"weights length ({weights.shape[0]}) must match "
f"X rows ({n})"
)
raise ValueError(f"weights length ({weights.shape[0]}) must match " f"X rows ({n})")
if np.any(np.isnan(weights)):
raise ValueError("Weights contain NaN values")
if np.any(np.isinf(weights)):
raise ValueError("Weights contain Inf values")
if np.any(weights < 0):
raise ValueError(
"Weights must be non-negative"
)
raise ValueError("Weights must be non-negative")
if weight_type == "fweight":
fractional = weights - np.round(weights)
if np.any(np.abs(fractional) > 1e-10):
Expand Down Expand Up @@ -693,13 +687,9 @@ def solve_ols(
weights=weights,
weight_type=weight_type,
)
vcov_out = _expand_vcov_with_nan(
vcov_reduced, _original_X.shape[1], kept_cols
)
vcov_out = _expand_vcov_with_nan(vcov_reduced, _original_X.shape[1], kept_cols)
else:
vcov_out = np.full(
(_original_X.shape[1], _original_X.shape[1]), np.nan
)
vcov_out = np.full((_original_X.shape[1], _original_X.shape[1]), np.nan)
else:
vcov_out = _compute_robust_vcov_numpy(
_original_X,
Expand Down Expand Up @@ -1122,6 +1112,7 @@ def solve_logit(
tol: float = 1e-8,
check_separation: bool = True,
rank_deficient_action: str = "warn",
weights: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Fit logistic regression via IRLS (Fisher scoring).
Expand All @@ -1147,6 +1138,13 @@ def solve_logit(
- "warn": Emit warning and drop columns (default)
- "error": Raise ValueError
- "silent": Drop columns silently
weights : np.ndarray, optional
Survey/observation weights of shape (n_samples,). When provided,
the IRLS working weights become ``weights * mu * (1 - mu)``
instead of ``mu * (1 - mu)``. This produces the survey-weighted
maximum likelihood estimator, matching R's ``svyglm(family=binomial)``.
When None (default), behavior is identical to unweighted logistic
regression.

Returns
-------
Expand Down Expand Up @@ -1203,11 +1201,16 @@ def solve_logit(
mu = np.clip(mu, 1e-10, 1 - 1e-10)

# Working weights and working response
w = mu * (1.0 - mu)
z = eta + (y - mu) / w
w_irls = mu * (1.0 - mu)
z = eta + (y - mu) / w_irls

if weights is not None:
w_total = weights * w_irls
else:
w_total = w_irls

# Weighted least squares: solve (X'WX) beta = X'Wz
sqrt_w = np.sqrt(w)
sqrt_w = np.sqrt(w_total)
Xw = X_solve * sqrt_w[:, None]
zw = z * sqrt_w
beta_new, _, _, _ = np.linalg.lstsq(Xw, zw, rcond=None)
Expand Down Expand Up @@ -1593,10 +1596,7 @@ def fit(
_use_survey_vcov = self.survey_design.needs_survey_vcov
# Canonicalize weights from survey_design to ensure consistency
# between coefficient estimation and survey vcov computation
if (
self.weights is not None
and self.weights is not self.survey_design.weights
):
if self.weights is not None and self.weights is not self.survey_design.weights:
warnings.warn(
"Explicit weights= differ from survey_design.weights. "
"Using survey_design weights for both coefficient "
Expand All @@ -1609,9 +1609,7 @@ def fit(
self.weight_type = self.survey_design.weight_type

if self.weights is not None:
self.weights = _validate_weights(
self.weights, self.weight_type, X.shape[0]
)
self.weights = _validate_weights(self.weights, self.weight_type, X.shape[0])

# Inject cluster as PSU for survey variance when no PSU specified.
# Use a local variable to avoid mutating self.survey_design, which
Expand All @@ -1622,7 +1620,9 @@ def fit(
and _effective_survey_design is not None
and _use_survey_vcov
):
from diff_diff.survey import ResolvedSurveyDesign as _RSD, _inject_cluster_as_psu
from diff_diff.survey import ResolvedSurveyDesign as _RSD
from diff_diff.survey import _inject_cluster_as_psu

if isinstance(_effective_survey_design, _RSD) and _effective_survey_design.psu is None:
_effective_survey_design = _inject_cluster_as_psu(
_effective_survey_design, effective_cluster_ids
Expand Down Expand Up @@ -1864,9 +1864,7 @@ def get_inference(
# Use project-standard NaN-safe inference (returns all-NaN when SE <= 0)
from diff_diff.utils import safe_inference

t_stat, p_value, conf_int = safe_inference(
coef, se, alpha=effective_alpha, df=effective_df
)
t_stat, p_value, conf_int = safe_inference(coef, se, alpha=effective_alpha, df=effective_df)

return InferenceResult(
coefficient=coef,
Expand Down
Loading
Loading