Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1508884
initial stab at a general matrix (no normalisation)
lkirk Oct 26, 2025
b144f8d
added dimension dropping, but I think transposing is better -- we don…
lkirk Oct 26, 2025
92b422a
finalize and add tests for single and multipop
lkirk Dec 4, 2025
8018550
turns out, the general norm function needs to know the state_dims
lkirk Dec 4, 2025
0351141
fix up a bit of naming in general test funcs, remove unneeded branch,…
lkirk Dec 4, 2025
ad920d0
flake8 does not like assigning lambdas to variables
lkirk Dec 4, 2025
8d22e8e
and black doesn't like that
lkirk Dec 4, 2025
e5fcd0e
do not test equality, this was useful on my local machine but is prob…
lkirk Dec 7, 2025
6bb867f
lowlevel tests
lkirk Mar 9, 2026
26bebf5
relax diff requirements (macos failure)
lkirk Mar 9, 2026
667dc65
relax diff requirements (macos failure) -- previous commit fixed one
lkirk Mar 9, 2026
8ea0852
new formatting tools, fix lint
lkirk Mar 9, 2026
2ddf6d0
remove TODOs, old comment and tested elsewhere
lkirk Mar 10, 2026
680f9c5
make testing more clear
petrelharp Mar 15, 2026
5e984be
preserve native dimensions instead of expanding at the end
lkirk Mar 16, 2026
e08abf4
Update tests according to Peters's feedback
lkirk Mar 16, 2026
bd0a1a5
msprime produces different trees on macos (same seed)
lkirk Mar 16, 2026
4c04ff3
Clean up python C tests
lkirk Mar 17, 2026
3f4c0ed
Add/refine tests, draft docstring
lkirk Mar 17, 2026
6685399
regain test coverage for default sample sets
lkirk Mar 17, 2026
70cd6b4
Revert "regain test coverage for default sample sets"
lkirk Mar 17, 2026
be997cf
update comment about result dimension
lkirk Mar 17, 2026
9481921
be more explicit about setting the default norm function
lkirk Mar 17, 2026
7387461
linting does not like assigning lambdas to variables
lkirk Mar 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 60 additions & 46 deletions c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -2298,8 +2298,9 @@ get_allele_samples(const tsk_site_t *site, tsk_size_t site_offset,
}

static int
norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights,
tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params)
norm_hap_weighted(tsk_size_t TSK_UNUSED(state_dim), const double *hap_weights,
tsk_size_t result_dim, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b),
double *result, void *params)
{
sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
const double *weight_row;
Expand All @@ -2315,8 +2316,9 @@ norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights,
}

static int
norm_hap_weighted_ij(tsk_size_t result_dim, const double *hap_weights,
tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params)
norm_hap_weighted_ij(tsk_size_t TSK_UNUSED(state_dim), const double *hap_weights,
tsk_size_t result_dim, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b),
double *result, void *params)
{
sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
const double *weight_row;
Expand All @@ -2341,8 +2343,9 @@ norm_hap_weighted_ij(tsk_size_t result_dim, const double *hap_weights,
}

static int
norm_total_weighted(tsk_size_t result_dim, const double *TSK_UNUSED(hap_weights),
tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params))
norm_total_weighted(tsk_size_t TSK_UNUSED(state_dim),
const double *TSK_UNUSED(hap_weights), tsk_size_t result_dim, tsk_size_t n_a,
tsk_size_t n_b, double *result, void *TSK_UNUSED(params))
{
tsk_size_t k;
double norm = 1 / (double) (n_a * n_b);
Expand Down Expand Up @@ -2411,8 +2414,8 @@ static int
compute_general_normed_two_site_stat_result(const tsk_bitset_t *state,
const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off,
tsk_size_t num_a_alleles, tsk_size_t num_b_alleles, tsk_size_t state_dim,
tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params,
norm_func_t *norm_f, bool polarised, two_locus_work_t *restrict work, double *result)
tsk_size_t result_dim, general_stat_func_t *f, void *f_params, norm_func_t *norm_f,
bool polarised, two_locus_work_t *restrict work, double *result)
{
int ret = 0;
// Sample sets and b sites are rows, a sites are columns
Expand Down Expand Up @@ -2445,7 +2448,7 @@ compute_general_normed_two_site_stat_result(const tsk_bitset_t *state,
if (ret != 0) {
goto out;
}
ret = norm_f(result_dim, weights, num_a_alleles - is_polarised,
ret = norm_f(state_dim, weights, result_dim, num_a_alleles - is_polarised,
num_b_alleles - is_polarised, norm, f_params);
if (ret != 0) {
goto out;
Expand All @@ -2463,9 +2466,8 @@ compute_general_normed_two_site_stat_result(const tsk_bitset_t *state,
static int
compute_general_two_site_stat_result(const tsk_bitset_t *state,
const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off,
tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f,
sample_count_stat_params_t *f_params, two_locus_work_t *restrict work,
double *result)
tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, void *f_params,
two_locus_work_t *restrict work, double *result)
{
int ret = 0;
tsk_size_t k;
Expand Down Expand Up @@ -2653,9 +2655,8 @@ static int
tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f,
sample_count_stat_params_t *f_params, norm_func_t *norm_f, tsk_size_t n_rows,
const tsk_id_t *row_sites, tsk_size_t n_cols, const tsk_id_t *col_sites,
tsk_flags_t options, double *result)
void *f_params, norm_func_t *norm_f, tsk_size_t n_rows, const tsk_id_t *row_sites,
tsk_size_t n_cols, const tsk_id_t *col_sites, tsk_flags_t options, double *result)
{
int ret = 0;
tsk_bitset_t allele_samples, allele_sample_sets;
Expand Down Expand Up @@ -3089,9 +3090,8 @@ advance_collect_edges(iter_state *s, tsk_id_t index)
static int
compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c,
const iter_state *A_state, const iter_state *B_state, tsk_size_t state_dim,
tsk_size_t result_dim, int sign, general_stat_func_t *f,
sample_count_stat_params_t *f_params, two_locus_work_t *restrict work,
double *result)
tsk_size_t result_dim, int sign, general_stat_func_t *f, void *f_params,
two_locus_work_t *restrict work, double *result)
{
int ret = 0;
double a_len, b_len;
Expand Down Expand Up @@ -3141,8 +3141,8 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c,

static int
compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state,
iter_state *r_state, general_stat_func_t *f, sample_count_stat_params_t *f_params,
tsk_size_t result_dim, tsk_size_t state_dim, double *result)
iter_state *r_state, general_stat_func_t *f, void *f_params, tsk_size_t result_dim,
tsk_size_t state_dim, double *result)
{
int ret = 0;
tsk_id_t e, c, ec, p, *updated_nodes = NULL;
Expand Down Expand Up @@ -3243,9 +3243,9 @@ static int
tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f,
sample_count_stat_params_t *f_params, norm_func_t *TSK_UNUSED(norm_f),
tsk_size_t n_rows, const double *row_positions, tsk_size_t n_cols,
const double *col_positions, tsk_flags_t TSK_UNUSED(options), double *result)
void *f_params, norm_func_t *TSK_UNUSED(norm_f), tsk_size_t n_rows,
const double *row_positions, tsk_size_t n_cols, const double *col_positions,
tsk_flags_t TSK_UNUSED(options), double *result)
{
int ret = 0;
int r, c;
Expand Down Expand Up @@ -3385,10 +3385,10 @@ check_sample_set_dups(tsk_size_t num_sample_sets, const tsk_size_t *sample_set_s
}

int
tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,
tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f,
norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites,
tsk_treeseq_two_locus_count_general_stat(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f,
void *f_params, norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites,
const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites,
const double *col_positions, tsk_flags_t options, double *result)
{
Expand All @@ -3398,10 +3398,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl
bool stat_site = !!(options & TSK_STAT_SITE);
bool stat_branch = !!(options & TSK_STAT_BRANCH);
tsk_size_t state_dim = num_sample_sets;
sample_count_stat_params_t f_params = { .sample_sets = sample_sets,
.num_sample_sets = num_sample_sets,
.sample_set_sizes = sample_set_sizes,
.set_indexes = set_indexes };

// We do not support two-locus node stats
if (!!(options & TSK_STAT_NODE)) {
Expand Down Expand Up @@ -3441,27 +3437,45 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl
goto out;
}
ret = tsk_treeseq_two_site_count_stat(self, state_dim, num_sample_sets,
sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows,
sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows,
row_sites, out_cols, col_sites, options, result);
} else if (stat_branch) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so proposal was to just change this to

} else {
  tsk_bug_assert(stat_branch);

ret = check_positions(
row_positions, out_rows, tsk_treeseq_get_sequence_length(self));
if (ret != 0) {
goto out;
}
ret = check_positions(
col_positions, out_cols, tsk_treeseq_get_sequence_length(self));
if (ret != 0) {
goto out;
}
ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets,
sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows,
row_positions, out_cols, col_positions, options, result);
goto out;
}
tsk_bug_assert(stat_branch);
ret = check_positions(
row_positions, out_rows, tsk_treeseq_get_sequence_length(self));
if (ret != 0) {
goto out;
}
ret = check_positions(
col_positions, out_cols, tsk_treeseq_get_sequence_length(self));
if (ret != 0) {
goto out;
}
ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets,
sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows,
row_positions, out_cols, col_positions, options, result);
out:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know why codecov says this line is not covered; do you? (sticking some prints or asserts in here can help track that down)

Image

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Judging from the coverage of the rest of that function (every goto being hit), my best guess is that the compiler is optimizing that label out. In fact, I could just change all of those goto ret to return ret and remove the out: label since we're not doing any sort of cleanup at the end of this function.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't do that; it's nice to have that pattern in the code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh! I see why this is complaining here. It's because right before this line is if (A) { ... } else if (B) { ... }, but there are no tests in which both A and B are false. And in fact it's impossible for this to happen. So the "else if ( )" should be an "else"; and if you want to be extra paranoid stick a tsk_bug_assert(B) in the code at the top of the else { }.

Every time I think "oh no codecov has this one wrong" it turns out that it's got a good point.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ended up removing the branch altogether and added tsk_bug_assert(mode_branch)

Copy link
Contributor

@petrelharp petrelharp Mar 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that works. But - and sorry to be nitpicky (about so many things) but how about this goes inside an else { }? I think that's a lot more clear: for instance, the "site mode" and "branch mode" paths look symmetric, then.

return ret;
}

int
tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,
tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f,
norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites,
const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites,
const double *col_positions, tsk_flags_t options, double *result)
{
sample_count_stat_params_t f_params = { .sample_sets = sample_sets,
.num_sample_sets = num_sample_sets,
.sample_set_sizes = sample_set_sizes,
.set_indexes = set_indexes };
return tsk_treeseq_two_locus_count_general_stat(self, num_sample_sets,
sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows,
row_sites, row_positions, out_cols, col_sites, col_positions, options, result);
}

/***********************************
* Allele frequency spectrum
***********************************/
Expand Down
11 changes: 9 additions & 2 deletions c/tskit/trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -1036,8 +1036,8 @@ int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const doub
tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows,
const double *windows, tsk_flags_t options, double *result);

typedef int norm_func_t(tsk_size_t result_dim, const double *hap_weights, tsk_size_t n_a,
tsk_size_t n_b, double *result, void *params);
typedef int norm_func_t(tsk_size_t state_dim, const double *hap_weights,
tsk_size_t result_dim, tsk_size_t n_a, tsk_size_t n_b, double *result, void *params);

int tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
Expand Down Expand Up @@ -1120,6 +1120,13 @@ typedef int general_sample_stat_method(const tsk_treeseq_t *self,
const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes,
tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result);

int tsk_treeseq_two_locus_count_general_stat(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f,
void *f_params, norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites,
const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites,
const double *col_positions, tsk_flags_t options, double *result);

typedef int two_locus_count_stat_method(const tsk_treeseq_t *self,
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites,
Expand Down
Loading
Loading