diff --git a/configure.ac b/configure.ac
index 25032ac7f..8e39bc5e3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -162,6 +162,35 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
         AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
 
+dnl Check for presence of builtin_clz
+AC_MSG_CHECKING([for working __builtin_clz])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  int z = 10;
+  if (__builtin_clz(z)) {
+    return 0;
+  }
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE___BUILTIN_CLZ], 1,
+            [Defined to 1 if __builtin_clz works])
+], [
+  AC_MSG_RESULT([no])
+])
+
+dnl Check for presence of builtin_prefetch
+AC_MSG_CHECKING([for working __builtin_prefetch])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  int z = 10;
+  __builtin_prefetch(&z);
+  return 0;
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE___BUILTIN_PREFETCH], 1,
+            [Defined to 1 if __builtin_prefetch works])
+], [
+  AC_MSG_RESULT([no])
+])
+
 dnl Check for working __builtin_cpu_supports (ssse3 is broken on some clangs)
 AC_MSG_CHECKING([for working __builtin_cpu_supports("ssse3")])
 AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index 71eb88d46..d7942f2d9 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -138,4 +138,20 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTSLIB_EXPORT
 #endif
 
+// Prefetch implementations.
+// We only support a basic implementation here.
+// hts_prefetch(p) hints that the memory at p is about to be read.
+// p must point to readable memory: the fallback below really loads a
+// byte from it, so unlike __builtin_prefetch it can fault on a bad address.
+#ifdef HAVE___BUILTIN_PREFETCH
+static inline void hts_prefetch(const void *p) {
+    __builtin_prefetch(p);
+}
+#else
+static inline void hts_prefetch(const void *p) {
+    // Fetch and discard is quite close to a genuine prefetch
+    *(const volatile char *)p;
+}
+#endif
+
 #endif
diff --git a/sam.c b/sam.c
index 0a386fe1d..11e3f3e69 100644
--- a/sam.c
+++ b/sam.c
@@ -5977,6 +5977,9 @@ static void overlap_remove(bam_plp_t iter, const bam1_t *b)
 {
     if ( !iter->overlaps ) return;
 
+    if ( b && (b->core.flag&BAM_FUNMAP || !(b->core.flag&BAM_FPROPER_PAIR)) ) //no need; NULL b is handled below
+        return;
+
     khiter_t kitr;
     if ( b )
     {
@@ -6007,6 +6010,8 @@ const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos,
         // write iter->plp at iter->pos
         lbnode_t **pptr = &iter->head;
         while (*pptr != iter->tail) {
+            if ((*pptr)->next)
+                hts_prefetch((*pptr)->next);
             lbnode_t *p = *pptr;
             if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
                 overlap_remove(iter, &p->b);