From 2fc73aca6a9e78bbb4750b8b3a985dfd628895b5 Mon Sep 17 00:00:00 2001
From: vasudeva8
Date: Thu, 5 Feb 2026 10:45:18 +0000
Subject: [PATCH 1/2] added prefetch and minor updates to speedup pileup

---
Review note: overlap_remove() tests "if ( b )" immediately after the new
early return, so b may be NULL at that point.  The flag test is therefore
guarded with "b &&" so that a NULL b is never dereferenced and still
reaches the existing NULL handling below.  The blob ids on the "index"
line have not been regenerated for this one-line change.

 sam.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sam.c b/sam.c
index 0a386fe1d..69205ec73 100644
--- a/sam.c
+++ b/sam.c
@@ -5977,6 +5977,9 @@ static void overlap_remove(bam_plp_t iter, const bam1_t *b)
 {
     if ( !iter->overlaps ) return;
 
+    if ( b && (b->core.flag&BAM_FUNMAP || !(b->core.flag&BAM_FPROPER_PAIR)) ) //no need; NULL b is handled below
+        return;
+
     khiter_t kitr;
     if ( b )
     {
@@ -6007,6 +6010,8 @@ const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos,
         // write iter->plp at iter->pos
         lbnode_t **pptr = &iter->head;
         while (*pptr != iter->tail) {
+            if ((*pptr)->next)
+                __builtin_prefetch((*pptr)->next);
             lbnode_t *p = *pptr;
             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
                 overlap_remove(iter, &p->b);

From 572ecc78e5fd498a814804653192ebae64030388 Mon Sep 17 00:00:00 2001
From: James Bonfield
Date: Mon, 16 Feb 2026 14:29:03 +0000
Subject: [PATCH 2/2] Add autoconf check for __builtin_prefetch

This is to facilitate the use of prefetch on deep pileups to avoid
memory latency.

On a Covid sample (local 34005_1#18.fixmate.bam file) of 150-200k
depth, the perf report -n stats for bam_plp64_next are:

    prefetch 16523
    volatile 16839
    none     17895

These are the 5th best time of 10 runs, showing 8% gain for
prefetching and a 6% gain for the poor-man's alternative.

On shallow data it's not beneficial, but at a cost of ~2% instead.
(We may wish to consider removing the volatile case as it's a little
more costly on shallow data and a little less beneficial on deep, but
it's going to be rarely used regardless.)

NB: speed has not been checked on ARM CPUs.

Also added a check for __builtin_clz which we already use elsewhere
via compiler version checks, as we may as well add both.
---
Review note: hts_prefetch() never writes through its argument, so it now
takes "const void *"; existing callers passing non-const pointers are
unaffected.  The fallback keeps the const qualifier in its cast and
discards the loaded value explicitly.  The number of added lines is
unchanged; the blob ids on the hts_defs.h "index" line have not been
regenerated.

 configure.ac      | 29 +++++++++++++++++++++++++++++
 htslib/hts_defs.h | 13 +++++++++++++
 sam.c             |  2 +-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 25032ac7f..8e39bc5e3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -162,6 +162,35 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
         AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
 
+dnl Check for presence of builtin_clz
+AC_MSG_CHECKING([for working __builtin_clz])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  int z = 10;
+  if (__builtin_clz(z)) {
+    return 0;
+  }
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE___BUILTIN_CLZ], 1,
+            [Defined to 1 if __builtin_clz works])
+], [
+  AC_MSG_RESULT([no])
+])
+
+dnl Check for presence of builtin_prefetch
+AC_MSG_CHECKING([for working __builtin_prefetch])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  int z = 10;
+  __builtin_prefetch(&z);
+  return 0;
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE___BUILTIN_PREFETCH], 1,
+            [Defined to 1 if __builtin_prefetch works])
+], [
+  AC_MSG_RESULT([no])
+])
+
 dnl Check for working __builtin_cpu_supports (ssse3 is broken on some clangs)
 AC_MSG_CHECKING([for working __builtin_cpu_supports("ssse3")])
 AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index 71eb88d46..d7942f2d9 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -138,4 +138,17 @@ DEALINGS IN THE SOFTWARE. */
 #define HTSLIB_EXPORT
 #endif
 
+// hts_prefetch(p): basic hint that the memory at p is about to be read.
+// p must point to readable memory, as the fallback below does a real load.
+#ifdef HAVE___BUILTIN_PREFETCH
+static inline void hts_prefetch(const void *p) {
+    __builtin_prefetch(p);
+}
+#else
+static inline void hts_prefetch(const void *p) {
+    // Fetch and discard is quite close to a genuine prefetch
+    (void)*(const volatile char *)p;
+}
+#endif
+
 #endif
diff --git a/sam.c b/sam.c
index 69205ec73..11e3f3e69 100644
--- a/sam.c
+++ b/sam.c
@@ -6011,7 +6011,7 @@ const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos,
         lbnode_t **pptr = &iter->head;
         while (*pptr != iter->tail) {
             if ((*pptr)->next)
-                __builtin_prefetch((*pptr)->next);
+                hts_prefetch((*pptr)->next);
             lbnode_t *p = *pptr;
             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
                 overlap_remove(iter, &p->b);