From c16de9d1de13553b3c68999d883a8b36ed5b526b Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 21 Apr 2026 15:30:20 -0400
Subject: [PATCH 01/33] Simplify index manipulation API with unified in-place
 interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Merge the two-tier `permute!`/`add_permute!` pattern into a single tier: `permute!`, `braid!`, `transpose!`, and `repartition!` now directly accept optional `α`, `β`, `backend`, and `allocator` arguments with sensible defaults (One(), Zero(), DefaultBackend(), DefaultAllocator()), matching TensorOperations convention. The old `add_permute!`, `add_braid!`, and `add_transpose!` are deprecated wrappers that emit `Base.depwarn` and forward to the new functions. The `allocator` kwarg is fully threaded through the internal call chain (`add_transform!`, all kernel functions, and `allocate_buffers`). Mooncake AD rules are updated to use the new function names.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../indexmanipulations.jl                     |  36 +--
 ext/TensorKitMooncakeExt/planaroperations.jl  |   2 +-
 src/planar/planaroperations.jl                |   6 +-
 src/tensors/braidingtensor.jl                 |  29 +-
 src/tensors/indexmanipulations.jl             | 294 ++++++++++--------
 src/tensors/tensoroperations.jl               |   4 +-
 src/tensors/treetransformers.jl               |   6 +-
 test/mooncake/indexmanipulations.jl           |  26 +-
 8 files changed, 217 insertions(+), 186 deletions(-)

diff --git a/ext/TensorKitMooncakeExt/indexmanipulations.jl b/ext/TensorKitMooncakeExt/indexmanipulations.jl
index c3bc3e26c..1fccdd9e6 100644
--- a/ext/TensorKitMooncakeExt/indexmanipulations.jl
+++ b/ext/TensorKitMooncakeExt/indexmanipulations.jl
@@ -1,11 +1,11 @@
 for transform in (:permute, :transpose)
-    add_transform! = Symbol(:add_, transform, :!)
-    add_transform_pullback = Symbol(add_transform!, :_pullback)
+    transform! = Symbol(transform, :!)
+    transform_pullback = Symbol(transform!, :_pullback)
     @eval @is_primitive(
         DefaultCtx,
         ReverseMode,
         Tuple{
-            typeof(TK.$add_transform!),
+            typeof(TK.$transform!),
             AbstractTensorMap,
             AbstractTensorMap, Index2Tuple,
             Number, Number, Vararg{Any},
@@ -13,7 +13,7 @@ for transform in (:permute, :transpose)
     )
 
     @eval function Mooncake.rrule!!(
-            ::CoDual{typeof(TK.$add_transform!)},
+            ::CoDual{typeof(TK.$transform!)},
             C_ΔC::CoDual{<:AbstractTensorMap},
             A_ΔA::CoDual{<:AbstractTensorMap}, p_Δp::CoDual{<:Index2Tuple},
             α_Δα::CoDual{<:Number}, β_Δβ::CoDual{<:Number},
@@ -30,17 +30,17 @@ for transform in (:permute, :transpose)
 
         # if we need to compute Δa, it is faster to allocate an intermediate permuted A
         # and store that instead of repeating the permutation in the pullback each time.
-        # effectively, we replace `add_permute` by `add ∘ permute`.
+        # effectively, we replace `permute!/transpose!` by `add ∘ permute/transpose`.
         Ap = if _needs_tangent(α)
             Ap = $transform(A, p)
             add!(C, Ap, α, β)
             Ap
         else
-            TK.$add_transform!(C, A, p, α, β, ba...)
+            TK.$transform!(C, A, p, α, β, ba...)
             nothing
         end
 
-        function $add_transform_pullback(::NoRData)
+        function $transform_pullback(::NoRData)
             copy!(C, C_cache)
 
             # ΔA
@@ -50,10 +50,10 @@ for transform in (:permute, :transpose)
             TC = VectorInterface.promote_scale(ΔC, α)
             if scalartype(ΔA) <: Real && !(TC <: Real)
                 ΔAc = TO.tensoralloc_add(TC, ΔC, pΔA, false, Val(false))
-                TK.$add_transform!(ΔAc, ΔC, pΔA, conj(α), Zero(), ba...)
+                TK.$transform!(ΔAc, ΔC, pΔA, conj(α), Zero(), ba...)
                 add!(ΔA, real(ΔAc))
             else
-                TK.$add_transform!(ΔA, ΔC, pΔA, conj(α), One(), ba...)
+                TK.$transform!(ΔA, ΔC, pΔA, conj(α), One(), ba...)
             end
             ΔAr = NoRData()
 
@@ -64,7 +64,7 @@ for transform in (:permute, :transpose)
             return NoRData(), ΔCr, ΔAr, NoRData(), Δαr, Δβr, map(Returns(NoRData()), ba)...
         end
 
-        return C_ΔC, $add_transform_pullback
+        return C_ΔC, $transform_pullback
     end
 end
 
@@ -72,7 +72,7 @@ end
     DefaultCtx,
     ReverseMode,
     Tuple{
-        typeof(TK.add_braid!),
+        typeof(TK.braid!),
         AbstractTensorMap,
         AbstractTensorMap, Index2Tuple, IndexTuple,
         Number, Number, Vararg{Any},
@@ -80,7 +80,7 @@ end
 )
 
 function Mooncake.rrule!!(
-        ::CoDual{typeof(TK.add_braid!)},
+        ::CoDual{typeof(TK.braid!)},
         C_ΔC::CoDual{<:AbstractTensorMap},
         A_ΔA::CoDual{<:AbstractTensorMap}, p_Δp::CoDual{<:Index2Tuple}, levels_Δlevels::CoDual{<:IndexTuple},
         α_Δα::CoDual{<:Number}, β_Δβ::CoDual{<:Number},
@@ -98,17 +98,17 @@ function Mooncake.rrule!!(
 
     # if we need to compute Δa, it is faster to allocate an intermediate braided A
     # and store that instead of repeating the permutation in the pullback each time.
-    # effectively, we replace `add_permute` by `add ∘ permute`.
+    # effectively, we replace `braid!` by `add ∘ braid`.
     Ap = if _needs_tangent(α)
         Ap = braid(A, p, levels)
         add!(C, Ap, α, β)
         Ap
     else
-        TK.add_braid!(C, A, p, levels, α, β, ba...)
+        TK.braid!(C, A, p, levels, α, β, ba...)
         nothing
     end
 
-    function add_braid!_pullback(::NoRData)
+    function braid!_pullback(::NoRData)
         copy!(C, C_cache)
 
         # ΔA
@@ -118,10 +118,10 @@ function Mooncake.rrule!!(
         TC = VectorInterface.promote_scale(ΔC, α)
         if scalartype(ΔA) <: Real && !(TC <: Real)
             ΔAc = TO.tensoralloc_add(TC, ΔC, pΔA, false, Val(false))
-            TK.add_braid!(ΔAc, ΔC, pΔA, ilevels, conj(α), Zero(), ba...)
+            TK.braid!(ΔAc, ΔC, pΔA, ilevels, conj(α), Zero(), ba...)
             add!(ΔA, real(ΔAc))
         else
-            TK.add_braid!(ΔA, ΔC, pΔA, ilevels, conj(α), One(), ba...)
+            TK.braid!(ΔA, ΔC, pΔA, ilevels, conj(α), One(), ba...)
         end
         ΔAr = NoRData()
 
@@ -132,7 +132,7 @@ function Mooncake.rrule!!(
         return NoRData(), ΔCr, ΔAr, NoRData(), NoRData(), Δαr, Δβr, map(Returns(NoRData()), ba)...
     end
 
-    return C_ΔC, add_braid!_pullback
+    return C_ΔC, braid!_pullback
 end
 
 # both are needed for correctly capturing every dispatch
diff --git a/ext/TensorKitMooncakeExt/planaroperations.jl b/ext/TensorKitMooncakeExt/planaroperations.jl
index 3c75fe2da..abbef5004 100644
--- a/ext/TensorKitMooncakeExt/planaroperations.jl
+++ b/ext/TensorKitMooncakeExt/planaroperations.jl
@@ -60,7 +60,7 @@
 #     if length(q[1]) == 0
 #         ip = invperm(linearize(p))
 #         pΔA = _repartition(ip, A)
-#         TK.add_transpose!(ΔA, ΔC, pΔA, conj(α), One(), backend, allocator)
+#         TK.transpose!(ΔA, ΔC, pΔA, conj(α), One(), backend, allocator)
 #         return NoRData()
 #     end
 #     # if length(q[1]) == 1
diff --git a/src/planar/planaroperations.jl b/src/planar/planaroperations.jl
index cde772982..758bb708a 100644
--- a/src/planar/planaroperations.jl
+++ b/src/planar/planaroperations.jl
@@ -32,7 +32,7 @@ function planaradd!(
         α::Number, β::Number,
         backend, allocator
     )
-    return add_transpose!(C, A, p, α, β, backend)
+    return transpose!(C, A, p, α, β, backend)
 end
 
 # insert default backend
@@ -173,7 +173,7 @@ function planarcontract!(
         A′ = TO.tensoralloc_add(
             scalartype(A), A, (oindA, cindA), false, Val(true), allocator
         )
-        add_transpose!(A′, A, (oindA, cindA), One(), Zero(), backend)
+        transpose!(A′, A, (oindA, cindA), One(), Zero(), backend)
     end
 
     if cindB == codB && oindB == domB
@@ -182,7 +182,7 @@ function planarcontract!(
         B′ = TensorOperations.tensoralloc_add(
             scalartype(B), B, (cindB, oindB), false, Val(true), allocator
         )
-        add_transpose!(B′, B, (cindB, oindB), One(), Zero(), backend)
+        transpose!(B′, B, (cindB, oindB), One(), Zero(), backend)
     end
     mul!(C, A′, B′, α, β)
     (oindA == codA && cindA == domA) || TO.tensorfree!(A′, allocator)
diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index d28b2e1df..a3a131c3d 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -221,7 +221,7 @@ function planarcontract!(
 
     I = sectortype(C)
     BraidingStyle(I) isa Bosonic &&
-        return add_permute!(C, B, (reverse(cindB), oindB), α, β, backend)
+        return permute!(C, B, (reverse(cindB), oindB), α, β, backend; allocator)
 
     # Non-bosonic case: factor into a cyclic transpose (no crossings) + a single Artin braid
     # that swaps the two contracted legs, producing the R-symbol that A encodes. Naively
@@ -234,7 +234,7 @@ function planarcontract!(
         B′ = TO.tensoralloc_add(
             scalartype(B), B, (cindB, oindB), false, Val(true), allocator
         )
-        add_transpose!(B′, B, (cindB, oindB), One(), Zero(), backend)
+        transpose!(B′, B, (cindB, oindB), One(), Zero(), backend; allocator)
     end
 
     levelsA = A.adjoint ? (1, 2, 2, 1) : (2, 1, 1, 2)
@@ -244,9 +244,9 @@ function planarcontract!(
         ntuple(Returns(3), N - 2)...,
     )
 
-    add_braid!(
+    braid!(
         C, B′, ((2, 1), ntuple(i -> i + 2, N - 2)),
-        levels, α, β, backend,
+        levels, α, β, backend; allocator
     )
 
     B_in_layout || TO.tensorfree!(B′, allocator)
@@ -288,24 +288,24 @@ function planarcontract!(
         A′ = TO.tensoralloc_add(
             scalartype(A), A, (oindA, cindA), false, Val(true), allocator
         )
-        add_transpose!(A′, A, (oindA, cindA), One(), Zero(), backend)
+        transpose!(A′, A, (oindA, cindA), One(), Zero(), backend; allocator)
     end
 
     levelsB = B.adjoint ? (1, 2, 2, 1) : (2, 1, 1, 2)
     N = numind(A)
     M = N - 2
     levels = (
         ntuple(Returns(3), M)...,
         levelsB[cindB[1]], levelsB[cindB[2]],
     )
 
-    add_braid!(
+    braid!(
         C, A′, (ntuple(identity, M), (N, N - 1)),
-        levels, α, β, backend,
+        levels, α, β, backend; allocator
     )
 
     A_in_layout || TO.tensorfree!(A′, allocator)
     return C
 end
 
 # ambiguity fix:
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 3108abb17..ce998f396 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -47,22 +47,29 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false)
 end
 
 """
-    permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple)
+    permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple,
+             α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+             allocator=TO.DefaultAllocator())
         -> tdst
 
-Write into `tdst` the result of permuting the indices of `tsrc`.
+Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after permuting the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
-                
-See [`permute`](@ref) for creating a new tensor and [`add_permute!`](@ref) for a more general version.
+
+See [`permute`](@ref) for creating a new tensor.
 """
 @propagate_inbounds function Base.permute!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
+        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+        allocator=TO.DefaultAllocator()
     )
-    return add_permute!(tdst, tsrc, p, One(), Zero())
+    @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
+    transformer = treepermuter(tdst, tsrc, p)
+    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator)
 end
 
 """
-    permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false) -> tdst::TensorMap
+    permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false,
+            allocator=TO.DefaultAllocator()) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by permuting the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -70,9 +77,11 @@ The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂`
 If `copy = false`, `tdst` might share data with `tsrc` whenever possible.
 Otherwise, a copy is always made.
 
-To permute into an existing destination, see [permute!](@ref) and [`add_permute!`](@ref)
+To permute into an existing destination, see [`permute!`](@ref).
 """
-function permute(t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false)
+function permute(
+        t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator()
+    )
     # share data if possible
     if !copy
         if p == (codomainind(t), domainind(t))
@@ -84,14 +93,15 @@ function permute(t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false)
 
     # general case
     tdst = similar(t, promote_permute(t), permute(space(t), p))
-    return @inbounds permute!(tdst, t, p)
+    return @inbounds permute!(tdst, t, p; allocator)
 end
-function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false)
+function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator())
     p₁′ = adjointtensorindices(t, p₂)
     p₂′ = adjointtensorindices(t, p₁)
-    return adjoint(permute(adjoint(t), (p₁′, p₂′); copy))
+    return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, allocator))
 end
-permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false) = permute(t, (p, ()); copy)
+permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false, allocator=TO.DefaultAllocator()) =
+    permute(t, (p, ()); copy, allocator)
 
 function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple)
     return (p₁ === codomainind(t) && p₂ === domainind(t))
@@ -118,25 +128,33 @@ end
 # Braid
 """
     braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-           (p₁, p₂)::Index2Tuple, levels::Tuple)
+           (p₁, p₂)::Index2Tuple, levels::IndexTuple,
+           α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+           allocator=TO.DefaultAllocator())
         -> tdst
 
-Write into `tdst` the result of braiding the indices of `tsrc`.
+Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after braiding the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
 Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or height to the indices of `tsrc`,
 which determines whether they will braid over or under any other index with which they have to change places.
 
-See [`braid`](@ref) for creating a new tensor and [`add_braid!`](@ref) for a more general version.
+See [`braid`](@ref) for creating a new tensor.
 """
 @propagate_inbounds function braid!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
+        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+        allocator=TO.DefaultAllocator()
     )
-    return add_braid!(tdst, tsrc, p, levels, One(), Zero())
+    @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels)
+    levels1 = TupleTools.getindices(levels, codomainind(tsrc))
+    levels2 = TupleTools.getindices(levels, domainind(tsrc))
+    transformer = treebraider(tdst, tsrc, p, (levels1, levels2))
+    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator)
 end
 
 """
     braid(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple;
-          copy::Bool = false)
+          copy::Bool = false, allocator=TO.DefaultAllocator())
         -> tdst::TensorMap
 
 Return tensor `tdst` obtained by braiding the indices of `tsrc`.
@@ -146,19 +164,20 @@ which determines whether they will braid over or under any other index with whic
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
 
-To braid into an existing destination, see [braid!](@ref) and [`add_braid!`](@ref)
+To braid into an existing destination, see [`braid!`](@ref).
 """
 function braid(
-        t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple; copy::Bool = false
+        t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple;
+        copy::Bool = false, allocator=TO.DefaultAllocator()
     )
     length(levels) == numind(t) || throw(ArgumentError("invalid levels"))
 
-    BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy)
+    BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t
 
     # general case
     tdst = similar(t, promote_braid(t), permute(space(t), p))
-    return @inbounds braid!(tdst, t, p, levels)
+    return @inbounds braid!(tdst, t, p, levels; allocator)
 end
 # TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument.
 
@@ -167,25 +186,32 @@ _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codom
 
 """
     transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-               (p₁, p₂)::Index2Tuple)
+               (p₁, p₂)::Index2Tuple,
+               α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+               allocator=TO.DefaultAllocator())
         -> tdst
 
-Write into `tdst` the result of transposing the indices of `tsrc`.
+Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after transposing the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
 The new index positions should be attainable without any indices crossing each other, i.e.,
 the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`.
 
-See [`transpose`](@ref) for creating a new tensor and [`add_transpose!`](@ref) for a more general version.
+See [`transpose`](@ref) for creating a new tensor.
 """
 @propagate_inbounds function LinearAlgebra.transpose!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(tsrc)
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
+        p::Index2Tuple = _transpose_indices(tsrc),
+        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+        allocator=TO.DefaultAllocator()
     )
-    return add_transpose!(tdst, tsrc, (p₁, p₂), One(), Zero())
+    @boundscheck spacecheck_transform(transpose, tdst, tsrc, p)
+    transformer = treetransposer(tdst, tsrc, p)
+    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator)
 end
 
 """
     transpose(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple;
-              copy::Bool=false)
+              copy::Bool=false, allocator=TO.DefaultAllocator())
         -> tdst::TensorMap
 
 Return tensor `tdst` obtained by transposing the indices of `tsrc`.
@@ -195,50 +221,59 @@ the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permuta
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
 
-To permute into an existing destination, see [permute!](@ref) and [`add_permute!`](@ref)
+To transpose into an existing destination, see [`transpose!`](@ref).
 """
 function LinearAlgebra.transpose(
         t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t);
-        copy::Bool = false
+        copy::Bool = false, allocator=TO.DefaultAllocator()
     )
-    sectortype(t) === Trivial && return permute(t, p; copy)
+    sectortype(t) === Trivial && return permute(t, p; copy, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t
 
     # general case
     tdst = similar(t, promote_transpose(t), permute(space(t), p))
-    return @inbounds transpose!(tdst, t, p)
+    return @inbounds transpose!(tdst, t, p; allocator)
 end
 
 function LinearAlgebra.transpose(
         t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t);
-        copy::Bool = false
+        copy::Bool = false, allocator=TO.DefaultAllocator()
     )
     p₁′ = map(n -> adjointtensorindex(t, n), p₂)
     p₂′ = map(n -> adjointtensorindex(t, n), p₁)
-    return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy = copy))
+    return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, allocator))
 end
 
 """
-    repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap) -> tdst
+    repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
+                 α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+                 allocator=TO.DefaultAllocator())
+        -> tdst
 
-Write into `tdst` the result of repartitioning the indices of `tsrc`. This is just a special
-case of a transposition that only changes the number of in- and outgoing indices.
+Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after repartitioning the indices of
+`tsrc`. This is just a special case of a transposition that only changes the number of in- and
+outgoing indices.
 
 See [`repartition`](@ref) for creating a new tensor.
 """
-@propagate_inbounds function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)
+@propagate_inbounds function repartition!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
+        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
+        allocator=TO.DefaultAllocator()
+    )
     check_spacetype(tdst, tsrc)
     numind(tsrc) == numind(tdst) ||
         throw(ArgumentError("tsrc and tdst should have an equal amount of indices"))
     all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...)
     p₁ = ntuple(i -> all_inds[i], numout(tdst))
     p₂ = reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst)))
-    return transpose!(tdst, tsrc, (p₁, p₂))
+    return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend...; allocator)
 end
 
 """
     repartition(
-        tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false
+        tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false,
+        allocator=TO.DefaultAllocator()
     ) where {T, S} -> tdst::AbstractTensorMap{T, S, N₁, N₂}
 
 Return tensor `tdst` obtained by repartitioning the indices of `t`.
@@ -249,14 +284,15 @@ If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwis
 To repartition into an existing destination, see [repartition!](@ref).
 """
 @constprop :aggressive function repartition(
-        t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁; copy::Bool = false
+        t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁;
+        copy::Bool = false, allocator=TO.DefaultAllocator()
     )
     N₁ + N₂ == numind(t) ||
         throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)"))
     all_inds = (codomainind(t)..., reverse(domainind(t))...)
     p₁ = ntuple(i -> all_inds[i], N₁)
     p₂ = reverse(ntuple(i -> all_inds[i + N₁], N₂))
-    return transpose(t, (p₁, p₂); copy)
+    return transpose(t, (p₁, p₂); copy, allocator)
 end
 
 # Twist
@@ -394,7 +430,7 @@ For this to work, that factor has to be isomorphic to the field of scalars.
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
 
-This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}) 
+This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i})
 and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}).
 """
 function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i}
@@ -447,67 +483,56 @@ end
     return nothing
 end
 
-
+# Deprecated add_*! wrappers
+# --------------------------
 """
-    add_permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple,
-                 α::Number, β::Number, backend::AbstractBackend...)
-
-Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after permuting 
-the indices of `tsrc` according to `(p₁, p₂)`.
+    add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...)
 
-See also [`permute`](@ref), [`permute!`](@ref), [`add_braid!`](@ref), [`add_transpose!`](@ref).
+!!! warning "Deprecated"
+    `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β, backend...)` instead.
 """
-@propagate_inbounds function add_permute!(
+function add_permute!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
         α::Number, β::Number, backend::AbstractBackend...
     )
-    @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
-    transformer = treepermuter(tdst, tsrc, p)
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...)
+    Base.depwarn("`add_permute!` is deprecated, use `permute!` instead", :add_permute!)
+    return @inbounds permute!(tdst, tsrc, p, α, β, backend...)
 end
 
 """
-    add_braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple,
-               levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend...)
+    add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number,
+               backend::AbstractBackend...)
 
-Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after braiding
-the indices of `tsrc` according to `(p₁, p₂)` and `levels`.
-
-See also [`braid`](@ref), [`braid!`](@ref), [`add_permute!`](@ref), [`add_transpose!`](@ref).
+!!! warning "Deprecated"
+    `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β, backend...)` instead.
 """
-@propagate_inbounds function add_braid!(
+function add_braid!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
         α::Number, β::Number, backend::AbstractBackend...
     )
-    @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels)
-    levels1 = TupleTools.getindices(levels, codomainind(tsrc))
-    levels2 = TupleTools.getindices(levels, domainind(tsrc))
-    # TODO: arg order for tensormaps is different than for fusiontrees
-    transformer = treebraider(tdst, tsrc, p, (levels1, levels2))
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...)
+    Base.depwarn("`add_braid!` is deprecated, use `braid!` instead", :add_braid!)
+    return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend...)
 end
 
 """
-    add_transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple,
-                   α::Number, β::Number, backend::AbstractBackend...)
+    add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number,
+                   backend::AbstractBackend...)
 
-Return the updated `tdst`, which is the result of adding `α * tsrc` to `tdst` after transposing
-the indices of `tsrc` according to `(p₁, p₂)`.
-
-See also [`transpose`](@ref), [`transpose!`](@ref), [`add_permute!`](@ref), [`add_braid!`](@ref).
+!!! warning "Deprecated"
+    `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β, backend...)` instead.
 """
-@propagate_inbounds function add_transpose!(
+function add_transpose!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
         α::Number, β::Number, backend::AbstractBackend...
     )
-    @boundscheck spacecheck_transform(transpose, tdst, tsrc, p)
-    transformer = treetransposer(tdst, tsrc, p)
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...)
+    Base.depwarn("`add_transpose!` is deprecated, use `transpose!` instead", :add_transpose!)
+    return @inbounds transpose!(tdst, tsrc, p, α, β, backend...)
 end
 
 @propagate_inbounds function add_transform!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer,
-        α::Number, β::Number, backend::AbstractBackend...
+        α::Number, β::Number, backend::AbstractBackend...;
+        allocator=TO.DefaultAllocator()
     )
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
 
@@ -515,14 +540,15 @@ end
         add!(tdst, tsrc, α, β)
     else
         I = sectortype(tdst)
+        _backend = isempty(backend) ? TO.DefaultBackend() : only(backend)
         if I === Trivial
-            add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...)
+            add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, _backend; allocator)
         else
             style = FusionStyle(I)
             if use_threaded_transform(tdst, transformer)
-                add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend...)
+                add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator)
             else
-                add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend...)
+                add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator)
             end
         end
     end
@@ -539,70 +565,75 @@ end
 
 # Trivial implementations
 # -----------------------
-function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...)
-    TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...)
+function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator())
+    TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator)
     return nothing
 end
 
 # Non-threaded implementations
 # ----------------------------
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend...
+        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator()
     )
     for (f₁, f₂) in fusiontrees(tsrc)
-        _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend...)
+        _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator)
     end
     return nothing
 end
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend...
+        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend;
+        allocator=TO.DefaultAllocator()
     )
     for subtransformer in transformer.data
-        _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+        _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
     end
     return nothing
 end
-function add_kernel_nonthreaded!(::FusionStyle, tdst, tsrc, p, transformer, α, β, backend...)
+function add_kernel_nonthreaded!(
+        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator()
+    )
     # preallocate buffers
-    buffers = allocate_buffers(tdst, tsrc, transformer)
+    buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
 
     for src in fusionblocks(tsrc)
         if length(src) == 1
-            _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend...)
+            _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator)
         else
-            _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend...)
+            _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator)
         end
     end
     return nothing
 end
 # specialization in the case of TensorMap
 function add_kernel_nonthreaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...
+        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
+        allocator=TO.DefaultAllocator()
     )
     # preallocate buffers
-    buffers = allocate_buffers(tdst, tsrc, transformer)
+    buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
 
     for subtransformer in transformer.data
         # Special case without intermediate buffers whenever there is only a single block
         if length(subtransformer[1]) == 1
-            _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+            _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
         else
-            _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...)
+            _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator)
         end
     end
     return nothing
 end
 # ambiguity resolution
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...
+        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
+        allocator=TO.DefaultAllocator()
     )
     throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`"))
 end
 # Threaded implementations
 # ------------------------
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend...;
-        ntasks::Int = get_num_transformer_threads()
+        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend;
+        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
     )
     trees = fusiontrees(tsrc)
     nblocks = length(trees)
@@ -613,15 +644,15 @@ function add_kernel_threaded!(
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds (f₁, f₂) = trees[local_counter]
-                _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend...)
+                _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator)
             end
         end
     end
     return nothing
 end
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend...;
-        ntasks::Int = get_num_transformer_threads()
+        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend;
+        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
     )
     nblocks = length(transformer.data)
     counter = Threads.Atomic{Int}(1)
@@ -631,7 +662,7 @@ function add_kernel_threaded!(
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds subtransformer = transformer.data[local_counter]
-                _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+                _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
             end
         end
     end
@@ -639,8 +670,8 @@ function add_kernel_threaded!(
 end
 
 function add_kernel_threaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend...;
-        ntasks::Int = get_num_transformer_threads()
+        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend;
+        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
     )
     allblocks = fusionblocks(tsrc)
     nblocks = length(allblocks)
@@ -649,16 +680,16 @@ function add_kernel_threaded!(
     Threads.@sync for _ in 1:min(ntasks, nblocks)
         Threads.@spawn begin
             # preallocate buffers for each task
-            buffers = allocate_buffers(tdst, tsrc, transformer)
+            buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
 
             while true
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds src = allblocks[local_counter]
                 if length(src) == 1
-                    _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend...)
+                    _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator)
                 else
-                    _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend...)
+                    _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator)
                 end
             end
         end
@@ -668,8 +699,8 @@ function add_kernel_threaded!(
 end
 # specialization in the case of TensorMap
 function add_kernel_threaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...;
-        ntasks::Int = get_num_transformer_threads()
+        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
+        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
     )
     nblocks = length(transformer.data)
 
@@ -677,16 +708,16 @@ function add_kernel_threaded!(
     Threads.@sync for _ in 1:min(ntasks, nblocks)
         Threads.@spawn begin
             # preallocate buffers for each task
-            buffers = allocate_buffers(tdst, tsrc, transformer)
+            buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
 
             while true
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds subtransformer = transformer.data[local_counter]
                 if length(subtransformer[1]) == 1
-                    _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+                    _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
                 else
-                    _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...)
+                    _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator)
                 end
             end
         end
@@ -696,8 +727,8 @@ function add_kernel_threaded!(
 end
 # ambiguity resolution
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...;
-        ntasks::Int = get_num_transformer_threads()
+        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
+        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
     )
     throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`"))
 end
@@ -705,40 +736,49 @@ end
 
 # Auxiliary methods
 # -----------------
-function _add_transform_single!(tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend...)
+function _add_transform_single!(
+        tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend;
+        allocator=TO.DefaultAllocator()
+    )
     (f₁′, f₂′), coeff = transformer((f₁, f₂))
-    @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend...)
+    @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
     return nothing
 end
-function _add_transform_single!(tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend...)
+function _add_transform_single!(
+        tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend;
+        allocator=TO.DefaultAllocator()
+    )
     dst, U = transformer(src)
     f₁, f₂ = only(fusiontrees(src))
     f₁′, f₂′ = only(fusiontrees(dst))
     coeff = only(U)
-    @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend...)
+    @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
     return nothing
 end
 function _add_transform_single!(
         tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData,
-        α, β, backend...
+        α, β, backend; allocator=TO.DefaultAllocator()
     )
     subblock_dst = StridedView(tdst.data, struct_dst...)
     subblock_src = StridedView(tsrc.data, struct_src...)
-    TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...)
+    TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend, allocator)
     return nothing
 end
 function _add_transform_single!(
         tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData,
-        α, β, backend...
+        α, β, backend; allocator=TO.DefaultAllocator()
     )
     struct_dst = (structs_dst[1], only(structs_dst[2])...)
     struct_src = (structs_src[1], only(structs_src[2])...)
     coeff = only(basistransform)
-    _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend...)
+    _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend; allocator)
     return nothing
 end
 
-function _add_transform_multi!(tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend...)
+function _add_transform_multi!(
+        tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend;
+        allocator=TO.DefaultAllocator()
+    )
     dst, U = transformer(src)
     rows, cols = size(U)
     sz_src = size(tsrc[first(fusiontrees(src))...])
@@ -764,14 +804,14 @@ function _add_transform_multi!(tdst, tsrc, p, src::FusionTreeBlock, transformer,
     for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
         subblock_dst = tdst[f₃, f₄]
         bufblock_dst = sreshape(buffer_dst[:, i], sz_src)
-        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...)
+        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator)
     end
 
     return nothing
 end
 function _add_transform_multi!(
         tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)),
-        (buffer1, buffer2), α, β, backend...
+        (buffer1, buffer2), α, β, backend; allocator=TO.DefaultAllocator()
     )
     rows, cols = size(U)
     blocksize = prod(sz_src)
@@ -796,7 +836,7 @@ function _add_transform_multi!(
     for (i, struct_dst) in enumerate(structs_dst)
         subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...)
         bufblock_dst = sreshape(buffer_dst[:, i], sz_src)
-        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...)
+        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator)
     end
 
     return nothing
diff --git a/src/tensors/tensoroperations.jl b/src/tensors/tensoroperations.jl
index 3fc79cf0c..375b63768 100644
--- a/src/tensors/tensoroperations.jl
+++ b/src/tensors/tensoroperations.jl
@@ -43,9 +43,9 @@ function TO.tensoradd!(
     if conjA
         A′ = adjoint(A)
         pA′ = adjointtensorindices(A, _canonicalize(pA, C))
-        add_permute!(C, A′, pA′, α, β, backend)
+        permute!(C, A′, pA′, α, β, backend)
     else
-        add_permute!(C, A, _canonicalize(pA, C), α, β, backend)
+        permute!(C, A, _canonicalize(pA, C), α, β, backend)
     end
     return C
 end
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index 30ec1de0f..d796754c3 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -135,13 +135,15 @@ function buffersize(transformer::GenericTreeTransformer)
 end
 
 function allocate_buffers(
-        tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer
+        tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer;
+        allocator=TO.DefaultAllocator()
     )
     sz = buffersize(transformer)
     return similar(tdst.data, sz), similar(tsrc.data, sz)
 end
 function allocate_buffers(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer;
+        allocator=TO.DefaultAllocator()
     )
     # be pessimistic and assume the worst for now
     sz = dim(space(tsrc))
diff --git a/test/mooncake/indexmanipulations.jl b/test/mooncake/indexmanipulations.jl
index 4dd6413cf..390721d71 100644
--- a/test/mooncake/indexmanipulations.jl
+++ b/test/mooncake/indexmanipulations.jl
@@ -18,7 +18,7 @@ eltypes = (Float64, ComplexF64)
     hasbraiding = BraidingStyle(sectortype(eltype(V))) isa HasBraiding
     symmetricbraiding = BraidingStyle(sectortype(eltype(V))) isa SymmetricBraiding
 
-    symmetricbraiding && @timedtestset "add_permute!" begin
+    symmetricbraiding && @timedtestset "permute!" begin
         A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])')
         α = randn(T)
         β = randn(T)
@@ -27,12 +27,12 @@ eltypes = (Float64, ComplexF64)
         for _ in 1:5
             p = randindextuple(numind(A))
             C = randn!(permute(A, p))
-            Mooncake.TestUtils.test_rule(rng, TensorKit.add_permute!, C, A, p, α, β; atol, rtol, mode)
+            Mooncake.TestUtils.test_rule(rng, TensorKit.permute!, C, A, p, α, β; atol, rtol, mode)
             A = C
         end
     end
 
-    @timedtestset "add_transpose!" begin
+    @timedtestset "transpose!" begin
         A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])')
         α = randn(T)
         β = randn(T)
@@ -41,18 +41,18 @@ eltypes = (Float64, ComplexF64)
         for _ in 1:2
             p = randcircshift(numout(A), numin(A))
             C = randn!(transpose(A, p))
-            Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, One(), Zero(); atol, rtol, mode)
-            Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, α, β; atol, rtol, mode)
+            Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, One(), Zero(); atol, rtol, mode)
+            Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, α, β; atol, rtol, mode)
             if !(T <: Real)
-                Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, real(A), p, α, β; atol, rtol, mode)
-                Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, A, p, real(α), β; atol, rtol, mode)
-                Mooncake.TestUtils.test_rule(rng, TensorKit.add_transpose!, C, real(A), p, real(α), β; atol, rtol, mode)
+                Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, real(A), p, α, β; atol, rtol, mode)
+                Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, A, p, real(α), β; atol, rtol, mode)
+                Mooncake.TestUtils.test_rule(rng, TensorKit.transpose!, C, real(A), p, real(α), β; atol, rtol, mode)
             end
             A = C
         end
     end
 
-    hasbraiding && @timedtestset "add_braid!" begin
+    hasbraiding && @timedtestset "braid!" begin
         A = randn(T, V[1] ⊗ V[2] ← (V[3] ⊗ V[4] ⊗ V[5])')
         α = randn(T)
         β = randn(T)
@@ -62,11 +62,11 @@ eltypes = (Float64, ComplexF64)
             p = randcircshift(numout(A), numin(A))
             levels = Tuple(randperm(numind(A)))
             C = randn!(transpose(A, p))
-            Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, α, β; atol, rtol, mode)
+            Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, α, β; atol, rtol, mode)
             if !(T <: Real)
-                Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, real(A), p, levels, α, β; atol, rtol, mode)
-                Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, real(α), β; atol, rtol, mode)
-                Mooncake.TestUtils.test_rule(rng, TensorKit.add_braid!, C, A, p, levels, real(α), real(β); atol, rtol, mode)
+                Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, real(A), p, levels, α, β; atol, rtol, mode)
+                Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, real(α), β; atol, rtol, mode)
+                Mooncake.TestUtils.test_rule(rng, TensorKit.braid!, C, A, p, levels, real(α), real(β); atol, rtol, mode)
             end
             A = C
         end

From 4f9bdd0c3eec16a6124a15a97023c2c5945ff17e Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Thu, 23 Apr 2026 10:19:52 -0400
Subject: [PATCH 02/33] Align index manipulation API with TensorOperations
 dispatch convention

Replace single-method-with-variadics (`backend::AbstractBackend...`) with the TensorOperations-style dispatch chain: each in-place function gets a chain of separate overloads that successively insert One()/Zero(), DefaultBackend(), and DefaultAllocator(), so that the fully-specified form (explicit `α`, `β`, `backend`, and `allocator`) is the implementation endpoint. Move `allocator` from keyword to positional argument throughout the internal chain (`add_transform!`, all kernel functions, `allocate_buffers`). Non-inplace functions (`permute`, `braid`, `transpose`, `repartition`) gain `backend` as a new keyword alongside the existing `allocator` keyword, keeping the user-facing API ergonomic. Docstrings updated to use TO-style bracket notation showing optional arguments.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/tensors/indexmanipulations.jl | 327 +++++++++++++++++-------------
 src/tensors/treetransformers.jl   |   4 +-
 2 files changed, 192 insertions(+), 139 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index ce998f396..c76fe766f 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -47,40 +47,54 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false)
 end
 
 """
-    permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple,
-             α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-             allocator=TO.DefaultAllocator())
-        -> tdst
+    permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
 
-Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after permuting the indices of `tsrc`.
+Compute `tdst = β * tdst + α * permute(tsrc, (p₁, p₂))`, writing the result into `tdst`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-See [`permute`](@ref) for creating a new tensor.
+See also [`permute`](@ref) for creating a new tensor.
 """
+function Base.permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple)
+    return permute!(tdst, tsrc, p, One(), Zero())
+end
+function Base.permute!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
+        α::Number, β::Number
+    )
+    return permute!(tdst, tsrc, p, α, β, TO.DefaultBackend())
+end
+function Base.permute!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
+        α::Number, β::Number, backend
+    )
+    return permute!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator())
+end
 @propagate_inbounds function Base.permute!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-        allocator=TO.DefaultAllocator()
+        α::Number, β::Number, backend, allocator
     )
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
     transformer = treepermuter(tdst, tsrc, p)
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator)
+    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator)
 end
 
 """
-    permute(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false,
-            allocator=TO.DefaultAllocator()) -> tdst::TensorMap
+    permute(tsrc, (p₁, p₂)::Index2Tuple; copy=false,
+            backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by permuting the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
 
 If `copy = false`, `tdst` might share data with `tsrc` whenever possible.
 Otherwise, a copy is always made.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-To permute into an existing destination, see [permute!](@ref)
+See also [`permute!`](@ref) for writing into an existing destination.
 """
 function permute(
-        t::AbstractTensorMap, p::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator()
+        t::AbstractTensorMap, p::Index2Tuple;
+        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
     )
     # share data if possible
     if !copy
@@ -93,15 +107,20 @@ function permute(
 
     # general case
     tdst = similar(t, promote_permute(t), permute(space(t), p))
-    return @inbounds permute!(tdst, t, p; allocator)
+    return @inbounds permute!(tdst, t, p, One(), Zero(), backend, allocator)
 end
-function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; copy::Bool = false, allocator=TO.DefaultAllocator())
+function permute(
+        t::AdjointTensorMap, (p₁, p₂)::Index2Tuple;
+        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+    )
     p₁′ = adjointtensorindices(t, p₂)
     p₂′ = adjointtensorindices(t, p₁)
-    return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, allocator))
+    return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, backend, allocator))
 end
-permute(t::AbstractTensorMap, p::IndexTuple; copy::Bool = false, allocator=TO.DefaultAllocator()) =
-    permute(t, (p, ()); copy, allocator)
+permute(
+    t::AbstractTensorMap, p::IndexTuple;
+    copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+) = permute(t, (p, ()); copy, backend, allocator)
 
 function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple)
     return (p₁ === codomainind(t) && p₂ === domainind(t))
@@ -127,35 +146,47 @@ end
 
 # Braid
 """
-    braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-           (p₁, p₂)::Index2Tuple, levels::IndexTuple,
-           α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-           allocator=TO.DefaultAllocator())
-        -> tdst
+    braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
 
-Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after braiding the indices of `tsrc`.
+Compute `tdst = β * tdst + α * braid(tsrc, (p₁, p₂), levels)`, writing the result into `tdst`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
 Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or height to the indices of `tsrc`,
 which determines whether they will braid over or under any other index with which they have to change places.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-See [`braid`](@ref) for creating a new tensor.
+See also [`braid`](@ref) for creating a new tensor.
 """
+function braid!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple
+    )
+    return braid!(tdst, tsrc, p, levels, One(), Zero())
+end
+function braid!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
+        α::Number, β::Number
+    )
+    return braid!(tdst, tsrc, p, levels, α, β, TO.DefaultBackend())
+end
+function braid!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
+        α::Number, β::Number, backend
+    )
+    return braid!(tdst, tsrc, p, levels, α, β, backend, TO.DefaultAllocator())
+end
 @propagate_inbounds function braid!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
-        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-        allocator=TO.DefaultAllocator()
+        α::Number, β::Number, backend, allocator
     )
     @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels)
     levels1 = TupleTools.getindices(levels, codomainind(tsrc))
     levels2 = TupleTools.getindices(levels, domainind(tsrc))
     transformer = treebraider(tdst, tsrc, p, (levels1, levels2))
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator)
+    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator)
 end
 
 """
-    braid(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple;
-          copy::Bool = false, allocator=TO.DefaultAllocator())
-        -> tdst::TensorMap
+    braid(tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy=false,
+          backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by braiding the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -163,21 +194,22 @@ Here, `levels` is a tuple of length `numind(tsrc)` that assigns a level or heigh
 which determines whether they will braid over or under any other index with which they have to change places.
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-To braid into an existing destination, see [braid!](@ref)
+See also [`braid!`](@ref) for writing into an existing destination.
 """
 function braid(
         t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple;
-        copy::Bool = false, allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
     )
     length(levels) == numind(t) || throw(ArgumentError("invalid levels"))
 
-    BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, allocator)
+    BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, backend, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t
 
     # general case
     tdst = similar(t, promote_braid(t), permute(space(t), p))
-    return @inbounds braid!(tdst, t, p, levels; allocator)
+    return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator)
 end
 # TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument.
 
@@ -185,81 +217,108 @@ end
 _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codomainind(t)))
 
 """
-    transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-               (p₁, p₂)::Index2Tuple,
-               α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-               allocator=TO.DefaultAllocator())
-        -> tdst
+    transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
 
-Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after transposing the indices of `tsrc`.
+Compute `tdst = β * tdst + α * transpose(tsrc, (p₁, p₂))`, writing the result into `tdst`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
 The new index positions should be attainable without any indices crossing each other, i.e.,
-the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`.
+the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of
+`(codomainind(tsrc)..., reverse(domainind(tsrc))...)`.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-See [`transpose`](@ref) for creating a new tensor.
+See also [`transpose`](@ref) for creating a new tensor.
 """
+function LinearAlgebra.transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)
+    return transpose!(tdst, tsrc, _transpose_indices(tsrc))
+end
+function LinearAlgebra.transpose!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple
+    )
+    return transpose!(tdst, tsrc, p, One(), Zero())
+end
+function LinearAlgebra.transpose!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
+        α::Number, β::Number
+    )
+    return transpose!(tdst, tsrc, p, α, β, TO.DefaultBackend())
+end
+function LinearAlgebra.transpose!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
+        α::Number, β::Number, backend
+    )
+    return transpose!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator())
+end
 @propagate_inbounds function LinearAlgebra.transpose!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-        p::Index2Tuple = _transpose_indices(tsrc),
-        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-        allocator=TO.DefaultAllocator()
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
+        α::Number, β::Number, backend, allocator
     )
     @boundscheck spacecheck_transform(transpose, tdst, tsrc, p)
     transformer = treetransposer(tdst, tsrc, p)
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend...; allocator)
+    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator)
 end
 
 """
-    transpose(tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple;
-              copy::Bool=false, allocator=TO.DefaultAllocator())
-        -> tdst::TensorMap
+    transpose(tsrc, (p₁, p₂)::Index2Tuple; copy=false,
+              backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by transposing the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
 The new index positions should be attainable without any indices crossing each other, i.e.,
-the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of `(codomainind(tsrc)..., reverse(domainind(tsrc))...)`.
+the permutation `(p₁..., reverse(p₂)...)` should constitute a cyclic permutation of
+`(codomainind(tsrc)..., reverse(domainind(tsrc))...)`.
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-To transpose into an existing destination, see [transpose!](@ref)
+See also [`transpose!`](@ref) for writing into an existing destination.
 """
 function LinearAlgebra.transpose(
         t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t);
-        copy::Bool = false, allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
     )
-    sectortype(t) === Trivial && return permute(t, p; copy, allocator)
+    sectortype(t) === Trivial && return permute(t, p; copy, backend, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t
 
     # general case
     tdst = similar(t, promote_transpose(t), permute(space(t), p))
-    return @inbounds transpose!(tdst, t, p; allocator)
+    return @inbounds transpose!(tdst, t, p, One(), Zero(), backend, allocator)
 end
 
 function LinearAlgebra.transpose(
         t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t);
-        copy::Bool = false, allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
     )
     p₁′ = map(n -> adjointtensorindex(t, n), p₂)
     p₂′ = map(n -> adjointtensorindex(t, n), p₁)
-    return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, allocator))
+    return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, backend, allocator))
 end
 
 """
-    repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-                 α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-                 allocator=TO.DefaultAllocator())
-        -> tdst
+    repartition!(tdst, tsrc[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
 
-Write into `tdst` the result of adding `α * tsrc` to `β * tdst` after repartitioning the indices of
-`tsrc`. This is just a special case of a transposition that only changes the number of in- and
-outgoing indices.
+Compute `tdst = β * tdst + α * repartition(tsrc, numout(tdst), numin(tdst))`, writing the result into `tdst`.
+This is a special case of `transpose!` that only changes the partition of indices between
+codomain and domain, without changing their cyclic order.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-See [`repartition`](@ref) for creating a new tensor.
+See also [`repartition`](@ref) for creating a new tensor.
 """
+function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)
+    return repartition!(tdst, tsrc, One(), Zero())
+end
+function repartition!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number
+    )
+    return repartition!(tdst, tsrc, α, β, TO.DefaultBackend())
+end
+function repartition!(
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number, backend
+    )
+    return repartition!(tdst, tsrc, α, β, backend, TO.DefaultAllocator())
+end
 @propagate_inbounds function repartition!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-        α::Number=One(), β::Number=Zero(), backend::AbstractBackend...;
-        allocator=TO.DefaultAllocator()
+        α::Number, β::Number, backend, allocator
     )
     check_spacetype(tdst, tsrc)
     numind(tsrc) == numind(tdst) ||
@@ -267,32 +326,32 @@ See [`repartition`](@ref) for creating a new tensor.
     all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...)
     p₁ = ntuple(i -> all_inds[i], numout(tdst))
     p₂ = reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst)))
-    return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend...; allocator)
+    return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend, allocator)
 end
 
 """
-    repartition(
-        tsrc::AbstractTensorMap{T, S}, N₁::Int, N₂::Int; copy::Bool=false,
-        allocator=TO.DefaultAllocator()
-    ) where {T, S} -> tdst::AbstractTensorMap{T, S, N₁, N₂}
+    repartition(tsrc, N₁::Int, N₂::Int=numind(tsrc)-N₁; copy=false,
+                backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst
 
-Return tensor `tdst` obtained by repartitioning the indices of `t`.
-The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `t`, respectively.
+Return tensor `tdst` obtained by repartitioning the indices of `tsrc`.
+The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `tsrc`,
+respectively.
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+Optionally specify a `backend` and `allocator` for the underlying array operation.
 
-To repartition into an existing destination, see [repartition!](@ref).
+See also [`repartition!`](@ref) for writing into an existing destination.
 """
 @constprop :aggressive function repartition(
         t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁;
-        copy::Bool = false, allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
     )
     N₁ + N₂ == numind(t) ||
         throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)"))
     all_inds = (codomainind(t)..., reverse(domainind(t))...)
     p₁ = ntuple(i -> all_inds[i], N₁)
     p₂ = reverse(ntuple(i -> all_inds[i + N₁], N₂))
-    return transpose(t, (p₁, p₂); copy, allocator)
+    return transpose(t, (p₁, p₂); copy, backend, allocator)
 end
 
 # Twist
@@ -486,10 +545,10 @@ end
 # Deprecated add_*! wrappers
 # --------------------------
 """
-    add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...)
+    add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend])
 
 !!! warning "Deprecated"
-    `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β, backend...)` instead.
+    `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β[, backend])` instead.
 """
 function add_permute!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
@@ -500,11 +559,10 @@ function add_permute!(
 end
 
 """
-    add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number,
-               backend::AbstractBackend...)
+    add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number[, backend])
 
 !!! warning "Deprecated"
-    `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β, backend...)` instead.
+    `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β[, backend])` instead.
 """
 function add_braid!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
@@ -515,11 +573,10 @@ function add_braid!(
 end
 
 """
-    add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number,
-                   backend::AbstractBackend...)
+    add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend])
 
 !!! warning "Deprecated"
-    `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β, backend...)` instead.
+    `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β[, backend])` instead.
 """
 function add_transpose!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
@@ -531,8 +588,7 @@ end
 
 @propagate_inbounds function add_transform!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer,
-        α::Number, β::Number, backend::AbstractBackend...;
-        allocator=TO.DefaultAllocator()
+        α::Number, β::Number, backend, allocator
     )
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
 
@@ -540,15 +596,14 @@ end
         add!(tdst, tsrc, α, β)
     else
         I = sectortype(tdst)
-        _backend = isempty(backend) ? TO.DefaultBackend() : only(backend)
         if I === Trivial
-            add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, _backend; allocator)
+            add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator)
         else
             style = FusionStyle(I)
             if use_threaded_transform(tdst, transformer)
-                add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator)
+                add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator)
             else
-                add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, _backend; allocator)
+                add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator)
             end
         end
     end
@@ -565,7 +620,7 @@ end
 
 # Trivial implementations
 # -----------------------
-function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator())
+function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator)
     TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator)
     return nothing
 end
@@ -573,67 +628,67 @@ end
 # Non-threaded implementations
 # ----------------------------
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator()
+        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator
     )
     for (f₁, f₂) in fusiontrees(tsrc)
-        _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator)
+        _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator)
     end
     return nothing
 end
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend;
-        allocator=TO.DefaultAllocator()
+        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend,
+        allocator
     )
     for subtransformer in transformer.data
-        _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
+        _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
     end
     return nothing
 end
 function add_kernel_nonthreaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend; allocator=TO.DefaultAllocator()
+        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator
     )
     # preallocate buffers
-    buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
+    buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
 
     for src in fusionblocks(tsrc)
         if length(src) == 1
-            _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator)
+            _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator)
         else
-            _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator)
+            _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator)
         end
     end
     return nothing
 end
 # specialization in the case of TensorMap
 function add_kernel_nonthreaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
-        allocator=TO.DefaultAllocator()
+        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
+        allocator
     )
     # preallocate buffers
-    buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
+    buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
 
     for subtransformer in transformer.data
         # Special case without intermediate buffers whenever there is only a single block
         if length(subtransformer[1]) == 1
-            _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
+            _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
         else
-            _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator)
+            _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator)
         end
     end
     return nothing
 end
 # ambiguity resolution
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
-        allocator=TO.DefaultAllocator()
+        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
+        allocator
     )
     throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`"))
 end
 # Threaded implementations
 # ------------------------
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend;
-        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
+        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator;
+        ntasks::Int = get_num_transformer_threads()
     )
     trees = fusiontrees(tsrc)
     nblocks = length(trees)
@@ -644,15 +699,15 @@ function add_kernel_threaded!(
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds (f₁, f₂) = trees[local_counter]
-                _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend; allocator)
+                _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator)
             end
         end
     end
     return nothing
 end
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend;
-        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
+        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend,
+        allocator; ntasks::Int = get_num_transformer_threads()
     )
     nblocks = length(transformer.data)
     counter = Threads.Atomic{Int}(1)
@@ -662,7 +717,7 @@ function add_kernel_threaded!(
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds subtransformer = transformer.data[local_counter]
-                _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
+                _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
             end
         end
     end
@@ -670,8 +725,8 @@ function add_kernel_threaded!(
 end
 
 function add_kernel_threaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend;
-        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
+        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator;
+        ntasks::Int = get_num_transformer_threads()
     )
     allblocks = fusionblocks(tsrc)
     nblocks = length(allblocks)
@@ -680,16 +735,16 @@ function add_kernel_threaded!(
     Threads.@sync for _ in 1:min(ntasks, nblocks)
         Threads.@spawn begin
             # preallocate buffers for each task
-            buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
+            buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
 
             while true
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds src = allblocks[local_counter]
                 if length(src) == 1
-                    _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend; allocator)
+                    _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator)
                 else
-                    _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend; allocator)
+                    _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator)
                 end
             end
         end
@@ -699,8 +754,8 @@ function add_kernel_threaded!(
 end
 # specialization in the case of TensorMap
 function add_kernel_threaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
-        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
+        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
+        allocator; ntasks::Int = get_num_transformer_threads()
     )
     nblocks = length(transformer.data)
 
@@ -708,16 +763,16 @@ function add_kernel_threaded!(
     Threads.@sync for _ in 1:min(ntasks, nblocks)
         Threads.@spawn begin
             # preallocate buffers for each task
-            buffers = allocate_buffers(tdst, tsrc, transformer; allocator)
+            buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
 
             while true
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds subtransformer = transformer.data[local_counter]
                 if length(subtransformer[1]) == 1
-                    _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend; allocator)
+                    _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
                 else
-                    _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend; allocator)
+                    _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator)
                 end
             end
         end
@@ -727,8 +782,8 @@ function add_kernel_threaded!(
 end
 # ambiguity resolution
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend;
-        ntasks::Int = get_num_transformer_threads(), allocator=TO.DefaultAllocator()
+        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
+        allocator; ntasks::Int = get_num_transformer_threads()
     )
     throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`"))
 end
@@ -737,16 +792,14 @@ end
 # Auxiliary methods
 # -----------------
 function _add_transform_single!(
-        tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend;
-        allocator=TO.DefaultAllocator()
+        tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend, allocator
     )
     (f₁′, f₂′), coeff = transformer((f₁, f₂))
     @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
     return nothing
 end
 function _add_transform_single!(
-        tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend;
-        allocator=TO.DefaultAllocator()
+        tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend, allocator
     )
     dst, U = transformer(src)
     f₁, f₂ = only(fusiontrees(src))
@@ -757,7 +810,7 @@ function _add_transform_single!(
 end
 function _add_transform_single!(
         tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData,
-        α, β, backend; allocator=TO.DefaultAllocator()
+        α, β, backend, allocator
     )
     subblock_dst = StridedView(tdst.data, struct_dst...)
     subblock_src = StridedView(tsrc.data, struct_src...)
@@ -766,18 +819,18 @@ function _add_transform_single!(
 end
 function _add_transform_single!(
         tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData,
-        α, β, backend; allocator=TO.DefaultAllocator()
+        α, β, backend, allocator
     )
     struct_dst = (structs_dst[1], only(structs_dst[2])...)
     struct_src = (structs_src[1], only(structs_src[2])...)
     coeff = only(basistransform)
-    _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend; allocator)
+    _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend, allocator)
     return nothing
 end
 
 function _add_transform_multi!(
-        tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend;
-        allocator=TO.DefaultAllocator()
+        tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend,
+        allocator
     )
     dst, U = transformer(src)
     rows, cols = size(U)
@@ -811,7 +864,7 @@ function _add_transform_multi!(
 end
 function _add_transform_multi!(
         tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)),
-        (buffer1, buffer2), α, β, backend; allocator=TO.DefaultAllocator()
+        (buffer1, buffer2), α, β, backend, allocator
     )
     rows, cols = size(U)
     blocksize = prod(sz_src)
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index d796754c3..8c63b00a8 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -135,14 +135,14 @@ function buffersize(transformer::GenericTreeTransformer)
 end
 
 function allocate_buffers(
-        tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer;
+        tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer,
         allocator=TO.DefaultAllocator()
     )
     sz = buffersize(transformer)
     return similar(tdst.data, sz), similar(tsrc.data, sz)
 end
 function allocate_buffers(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer;
+        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer,
         allocator=TO.DefaultAllocator()
     )
     # be pessimistic and assume the worst for now

From f6cd201cdf0686d0a0d8590bc13e292ddb2bc9ee Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Thu, 23 Apr 2026 11:02:28 -0400
Subject: [PATCH 03/33] simplify implementation: replace forwarding methods with default arguments

---
 src/TensorKit.jl                  |   3 +-
 src/tensors/indexmanipulations.jl | 121 ++++++++----------------------
 2 files changed, 34 insertions(+), 90 deletions(-)

diff --git a/src/TensorKit.jl b/src/TensorKit.jl
index d8361ac79..db3d9c50f 100644
--- a/src/TensorKit.jl
+++ b/src/TensorKit.jl
@@ -91,8 +91,7 @@ export left_orth, right_orth, left_null, right_null,
     isisometric, isunitary, project_isometric, project_isometric!,
     isposdef, isposdef!, sylvester, rank, cond
 
-export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition,
-    repartition!
+export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition, repartition!
 export catdomain, catcodomain, absorb, absorb!
 
 # tensor operations
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index c76fe766f..f9eb1b4da 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -46,8 +46,11 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false)
     return t′
 end
 
+# --------------
+#   permute(!)
+# --------------
 """
-    permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
+    permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α = 1, β = 0, [backend], [allocator]) -> tdst
 
 Compute `tdst = β * tdst + α * permute(tsrc, (p₁, p₂))`, writing the result into `tdst`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -55,24 +58,10 @@ Optionally specify a `backend` and `allocator` for the underlying array operatio
 
 See also [`permute`](@ref) for creating a new tensor.
 """
-function Base.permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple)
-    return permute!(tdst, tsrc, p, One(), Zero())
-end
-function Base.permute!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number
-    )
-    return permute!(tdst, tsrc, p, α, β, TO.DefaultBackend())
-end
-function Base.permute!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number, backend
-    )
-    return permute!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator())
-end
 @propagate_inbounds function Base.permute!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number, backend, allocator
+        α::Number = One(), β::Number = Zero(),
+        backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
     transformer = treepermuter(tdst, tsrc, p)
@@ -80,8 +69,7 @@ end
 end
 
 """
-    permute(tsrc, (p₁, p₂)::Index2Tuple; copy=false,
-            backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap
+    permute(tsrc, (p₁, p₂)::Index2Tuple; copy = false, [backend], [allocator]) -> tdst
 
 Return tensor `tdst` obtained by permuting the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -94,7 +82,7 @@ See also [`permute!`](@ref) for writing into an existing destination.
 """
 function permute(
         t::AbstractTensorMap, p::Index2Tuple;
-        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     # share data if possible
     if !copy
@@ -109,18 +97,12 @@ function permute(
     tdst = similar(t, promote_permute(t), permute(space(t), p))
     return @inbounds permute!(tdst, t, p, One(), Zero(), backend, allocator)
 end
-function permute(
-        t::AdjointTensorMap, (p₁, p₂)::Index2Tuple;
-        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
-    )
+function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; kwargs...)
     p₁′ = adjointtensorindices(t, p₂)
     p₂′ = adjointtensorindices(t, p₁)
-    return adjoint(permute(adjoint(t), (p₁′, p₂′); copy, backend, allocator))
+    return adjoint(permute(adjoint(t), (p₁′, p₂′); kwargs...))
 end
-permute(
-    t::AbstractTensorMap, p::IndexTuple;
-    copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
-) = permute(t, (p, ()); copy, backend, allocator)
+permute(t::AbstractTensorMap, p::IndexTuple; kwargs...) = permute(t, (p, ()); kwargs...)
 
 function has_shared_permute(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple)
     return (p₁ === codomainind(t) && p₂ === domainind(t))
@@ -144,9 +126,11 @@ function has_shared_permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple)
     return has_shared_permute(t', (p₁′, p₂′))
 end
 
-# Braid
+# -------------
+#   braid(!)
+# -------------
 """
-    braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
+    braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α = 1, β = 0, [backend], [allocator]) -> tdst
 
 Compute `tdst = β * tdst + α * braid(tsrc, (p₁, p₂), levels)`, writing the result into `tdst`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -156,26 +140,10 @@ Optionally specify a `backend` and `allocator` for the underlying array operatio
 
 See also [`braid`](@ref) for creating a new tensor.
 """
-function braid!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple
-    )
-    return braid!(tdst, tsrc, p, levels, One(), Zero())
-end
-function braid!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
-        α::Number, β::Number
-    )
-    return braid!(tdst, tsrc, p, levels, α, β, TO.DefaultBackend())
-end
-function braid!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
-        α::Number, β::Number, backend
-    )
-    return braid!(tdst, tsrc, p, levels, α, β, backend, TO.DefaultAllocator())
-end
 @propagate_inbounds function braid!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
-        α::Number, β::Number, backend, allocator
+        α::Number = One(), β::Number = Zero(),
+        backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     @boundscheck spacecheck_transform(braid, tdst, tsrc, p, levels)
     levels1 = TupleTools.getindices(levels, codomainind(tsrc))
@@ -200,7 +168,7 @@ See also [`braid!`](@ref) for writing into an existing destination.
 """
 function braid(
         t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple;
-        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     length(levels) == numind(t) || throw(ArgumentError("invalid levels"))
 
@@ -213,11 +181,13 @@ function braid(
 end
 # TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument.
 
-# Transpose
+# ----------------
+#   transpose(!)
+# ----------------
 _transpose_indices(t::AbstractTensorMap) = (reverse(domainind(t)), reverse(codomainind(t)))
 
 """
-    transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
+    transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α = 1, β = 0, [backend], [allocator]) -> tdst
 
 Compute `tdst = β * tdst + α * transpose(tsrc, (p₁, p₂))`, writing the result into `tdst`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -231,26 +201,10 @@ See also [`transpose`](@ref) for creating a new tensor.
 function LinearAlgebra.transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)
     return transpose!(tdst, tsrc, _transpose_indices(tsrc))
 end
-function LinearAlgebra.transpose!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple
-    )
-    return transpose!(tdst, tsrc, p, One(), Zero())
-end
-function LinearAlgebra.transpose!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number
-    )
-    return transpose!(tdst, tsrc, p, α, β, TO.DefaultBackend())
-end
-function LinearAlgebra.transpose!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number, backend
-    )
-    return transpose!(tdst, tsrc, p, α, β, backend, TO.DefaultAllocator())
-end
 @propagate_inbounds function LinearAlgebra.transpose!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number, backend, allocator
+        α::Number = One(), β::Number = Zero(),
+        backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     @boundscheck spacecheck_transform(transpose, tdst, tsrc, p)
     transformer = treetransposer(tdst, tsrc, p)
@@ -274,7 +228,7 @@ See also [`transpose!`](@ref) for writing into an existing destination.
 """
 function LinearAlgebra.transpose(
         t::AbstractTensorMap, p::Index2Tuple = _transpose_indices(t);
-        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     sectortype(t) === Trivial && return permute(t, p; copy, backend, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t
@@ -286,15 +240,18 @@ end
 
 function LinearAlgebra.transpose(
         t::AdjointTensorMap, (p₁, p₂)::Index2Tuple = _transpose_indices(t);
-        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     p₁′ = map(n -> adjointtensorindex(t, n), p₂)
     p₂′ = map(n -> adjointtensorindex(t, n), p₁)
     return adjoint(transpose(adjoint(t), (p₁′, p₂′); copy, backend, allocator))
 end
 
+# -------------------
+#   repartition(!)
+# -------------------
 """
-    repartition!(tdst, tsrc[, α=1[, β=0[, backend[, allocator]]]]) -> tdst
+    repartition!(tdst, tsrc, α = 1, β = 0, [backend], [allocator]) -> tdst
 
 Compute `tdst = β * tdst + α * repartition(tsrc)`, writing the result into `tdst`.
 This is a special case of `transpose!` that only changes the partition of indices between
@@ -303,22 +260,10 @@ Optionally specify a `backend` and `allocator` for the underlying array operatio
 
 See also [`repartition`](@ref) for creating a new tensor.
 """
-function repartition!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap)
-    return repartition!(tdst, tsrc, One(), Zero())
-end
-function repartition!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number
-    )
-    return repartition!(tdst, tsrc, α, β, TO.DefaultBackend())
-end
-function repartition!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, α::Number, β::Number, backend
-    )
-    return repartition!(tdst, tsrc, α, β, backend, TO.DefaultAllocator())
-end
 @propagate_inbounds function repartition!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap,
-        α::Number, β::Number, backend, allocator
+        α::Number = One(), β::Number = Zero(),
+        backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     check_spacetype(tdst, tsrc)
     numind(tsrc) == numind(tdst) ||
@@ -344,7 +289,7 @@ See also [`repartition!`](@ref) for writing into an existing destination.
 """
 @constprop :aggressive function repartition(
         t::AbstractTensorMap, N₁::Int, N₂::Int = numind(t) - N₁;
-        copy::Bool = false, backend=TO.DefaultBackend(), allocator=TO.DefaultAllocator()
+        copy::Bool = false, backend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     N₁ + N₂ == numind(t) ||
         throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)"))

From 116d6c8724fc1cb01de0f8c959e76d04128e2b94 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Thu, 23 Apr 2026 11:59:11 -0400
Subject: [PATCH 04/33] minor code improvements: use Base.Fix1 and a more informative levels error

---
 src/tensors/indexmanipulations.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index f9eb1b4da..f852f156b 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -111,8 +111,8 @@ function has_shared_permute(t::TensorMap, (p₁, p₂)::Index2Tuple)
     if p₁ === codomainind(t) && p₂ === domainind(t)
         return true
     elseif sectortype(t) === Trivial
-        stridet = i -> stride(t[], i)
-        sizet = i -> size(t[], i)
+        stridet = Base.Fix1(stride, t[])
+        sizet = Base.Fix1(size, t[])
         canfuse1, d1, s1 = TO._canfuse(sizet.(p₁), stridet.(p₁))
         canfuse2, d2, s2 = TO._canfuse(sizet.(p₂), stridet.(p₂))
         return canfuse1 && canfuse2 && s1 == 1 && (d2 == 1 || s2 == d1)
@@ -170,7 +170,7 @@ function braid(
         t::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple;
         copy::Bool = false, backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
-    length(levels) == numind(t) || throw(ArgumentError("invalid levels"))
+    length(levels) == numind(t) || throw(ArgumentError(lazy"length of levels should be $(numind(t)), got $(length(levels))"))
 
     BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, backend, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t

From 52296d87c2cd93782d6e37447c9ac8b551e06bed Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Fri, 24 Apr 2026 09:20:53 -0400
Subject: [PATCH 05/33] add braid codepath for adjoint tensors

---
 src/tensors/indexmanipulations.jl  | 11 ++++++++++-
 test/tensors/indexmanipulations.jl |  8 ++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index f852f156b..21af7ce93 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -179,7 +179,16 @@ function braid(
     tdst = similar(t, promote_braid(t), permute(space(t), p))
     return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator)
 end
-# TODO: braid for `AdjointTensorMap`; think about how to map the `levels` argument.
+function braid( # method for AdjointTensorMap: braids `adjoint(t)` and returns the adjoint of that result
+        t::AdjointTensorMap, (p₁, p₂)::Index2Tuple, levels::IndexTuple;
+        kwargs...
+    )
+    p₁′ = adjointtensorindices(t, p₂) # note the swap: p₂ provides the first index tuple ...
+    p₂′ = adjointtensorindices(t, p₁) # ... and p₁ the second
+    perm = adjointtensorindices(adjoint(t), ntuple(identity, numind(t))) # all indices 1:numind(t), mapped via adjoint(t)
+    levels′ = TupleTools.getindices(levels, perm) # levels reordered by perm (presumably into the index order of adjoint(t))
+    return adjoint(braid(adjoint(t), (p₁′, p₂′), levels′; kwargs...)) # kwargs are forwarded unchanged
+end
 
 # ----------------
 #   transpose(!)
diff --git a/test/tensors/indexmanipulations.jl b/test/tensors/indexmanipulations.jl
index c38b182a2..836418b3f 100644
--- a/test/tensors/indexmanipulations.jl
+++ b/test/tensors/indexmanipulations.jl
@@ -129,6 +129,14 @@ for V in spacelist
             @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
             @test flip(ta, (1, 2)) ≈ tb
         end
+        hasbraiding && !symmetricbraiding && @timedtestset "Braid AdjointTensorMap: adjoint identity" begin
+            t = rand(ComplexF64, V1 ⊗ V2 ← V3)
+            p = ((2,), (1, 3))
+            levels = (1, 3, 2)
+            t1 = copy(braid(t', p, levels))
+            t2 = braid(copy(t'), p, levels)
+            @test t1 ≈ t2
+        end
     end
     TensorKit.empty_globalcaches!()
 end

From aeeb31827bc78edceaed37aff6774b386812ef05 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Fri, 24 Apr 2026 14:47:02 -0400
Subject: [PATCH 06/33] rework `add_transform` kernels for TensorMap to only
 take data vector

---
 src/tensors/indexmanipulations.jl | 114 ++++++++++++++++--------------
 src/tensors/treetransformers.jl   |   7 ++
 2 files changed, 67 insertions(+), 54 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 21af7ce93..bb46b1690 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -564,6 +564,34 @@ end
 
     return tdst
 end
+@propagate_inbounds function add_transform!(
+        tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, transformer::TreeTransformer,
+        α::Number, β::Number, backend, allocator
+    )
+    @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
+    if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc)
+        add!(tdst, tsrc, α, β)
+        return tdst
+    end
+    if use_threaded_transform(tdst, transformer)
+        add_kernel_threaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator)
+    else
+        add_kernel_nonthreaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator)
+    end
+    return tdst
+end
+@propagate_inbounds function add_transform!(
+        tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, ::TrivialTreeTransformer,
+        α::Number, β::Number, backend, allocator
+    )
+    @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
+    if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc)
+        add!(tdst, tsrc, α, β)
+        return tdst
+    end
+    TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator)
+    return tdst
+end
 
 function use_threaded_transform(t::TensorMap, transformer)
     return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH
@@ -572,13 +600,6 @@ function use_threaded_transform(t::AbstractTensorMap, transformer)
     return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH
 end
 
-# Trivial implementations
-# -----------------------
-function add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator)
-    TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator)
-    return nothing
-end
-
 # Non-threaded implementations
 # ----------------------------
 function add_kernel_nonthreaded!(
@@ -590,11 +611,11 @@ function add_kernel_nonthreaded!(
     return nothing
 end
 function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend,
-        allocator
+        data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
+        α, β, backend, allocator
     )
     for subtransformer in transformer.data
-        _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
+        _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
     end
     return nothing
 end
@@ -615,29 +636,21 @@ function add_kernel_nonthreaded!(
 end
 # specialization in the case of TensorMap
 function add_kernel_nonthreaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
-        allocator
+        data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
+        α, β, backend, allocator
     )
-    # preallocate buffers
-    buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
+    buffers = allocate_buffers(data_dst, data_src, transformer, allocator)
 
     for subtransformer in transformer.data
         # Special case without intermediate buffers whenever there is only a single block
         if length(subtransformer[1]) == 1
-            _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
+            _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
         else
-            _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator)
+            _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator)
         end
     end
     return nothing
 end
-# ambiguity resolution
-function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
-        allocator
-    )
-    throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`"))
-end
 # Threaded implementations
 # ------------------------
 function add_kernel_threaded!(
@@ -660,8 +673,8 @@ function add_kernel_threaded!(
     return nothing
 end
 function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, backend,
-        allocator; ntasks::Int = get_num_transformer_threads()
+        data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
+        α, β, backend, allocator; ntasks::Int = get_num_transformer_threads()
     )
     nblocks = length(transformer.data)
     counter = Threads.Atomic{Int}(1)
@@ -671,7 +684,7 @@ function add_kernel_threaded!(
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds subtransformer = transformer.data[local_counter]
-                _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
+                _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
             end
         end
     end
@@ -708,8 +721,8 @@ function add_kernel_threaded!(
 end
 # specialization in the case of TensorMap
 function add_kernel_threaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
-        allocator; ntasks::Int = get_num_transformer_threads()
+        data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
+        α, β, backend, allocator; ntasks::Int = get_num_transformer_threads()
     )
     nblocks = length(transformer.data)
 
@@ -717,16 +730,16 @@ function add_kernel_threaded!(
     Threads.@sync for _ in 1:min(ntasks, nblocks)
         Threads.@spawn begin
             # preallocate buffers for each task
-            buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
+            buffers = allocate_buffers(data_dst, data_src, transformer, allocator)
 
             while true
                 local_counter = Threads.atomic_add!(counter, 1)
                 local_counter > nblocks && break
                 @inbounds subtransformer = transformer.data[local_counter]
                 if length(subtransformer[1]) == 1
-                    _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend, allocator)
+                    _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
                 else
-                    _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend, allocator)
+                    _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator)
                 end
             end
         end
@@ -734,13 +747,6 @@ function add_kernel_threaded!(
 
     return nothing
 end
-# ambiguity resolution
-function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend,
-        allocator; ntasks::Int = get_num_transformer_threads()
-    )
-    throw(ArgumentError("Cannot combine `GenericTreeTransformer` with `UniqueFusion`"))
-end
 
 
 # Auxiliary methods
@@ -763,22 +769,24 @@ function _add_transform_single!(
     return nothing
 end
 function _add_transform_single!(
-        tdst, tsrc, p, (coeff, struct_dst, struct_src)::AbelianTransformerData,
+        data_dst::DenseVector, data_src::DenseVector, p,
+        (coeff, struct_dst, struct_src)::AbelianTransformerData,
         α, β, backend, allocator
     )
-    subblock_dst = StridedView(tdst.data, struct_dst...)
-    subblock_src = StridedView(tsrc.data, struct_src...)
+    subblock_dst = StridedView(data_dst, struct_dst...)
+    subblock_src = StridedView(data_src, struct_src...)
     TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend, allocator)
     return nothing
 end
 function _add_transform_single!(
-        tdst, tsrc, p, (basistransform, structs_dst, structs_src)::GenericTransformerData,
+        data_dst::DenseVector, data_src::DenseVector, p,
+        (basistransform, structs_dst, structs_src)::GenericTransformerData,
         α, β, backend, allocator
     )
     struct_dst = (structs_dst[1], only(structs_dst[2])...)
     struct_src = (structs_src[1], only(structs_src[2])...)
     coeff = only(basistransform)
-    _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend, allocator)
+    _add_transform_single!(data_dst, data_src, p, (coeff, struct_dst, struct_src), α, β, backend, allocator)
     return nothing
 end
 
@@ -817,33 +825,31 @@ function _add_transform_multi!(
     return nothing
 end
 function _add_transform_multi!(
-        tdst, tsrc, p, (U, (sz_dst, structs_dst), (sz_src, structs_src)),
+        data_dst::DenseVector, data_src::DenseVector, p,
+        (U, (sz_dst, structs_dst), (sz_src, structs_src))::GenericTransformerData,
         (buffer1, buffer2), α, β, backend, allocator
     )
     rows, cols = size(U)
     blocksize = prod(sz_src)
-    matsize = (
-        prod(TupleTools.getindices(sz_src, codomainind(tsrc))),
-        prod(TupleTools.getindices(sz_src, domainind(tsrc))),
-    )
 
     # Filling up a buffer with contiguous data
     buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
+    ptriv = (ntuple(identity, length(sz_src)), ())
     for (i, struct_src) in enumerate(structs_src)
-        subblock_src = sreshape(StridedView(tsrc.data, sz_src, struct_src...), matsize)
-        bufblock_src = sreshape(buffer_src[:, i], matsize)
-        copy!(bufblock_src, subblock_src)
+        subblock_src = StridedView(data_src, sz_src, struct_src...)
+        bufblock_src = sreshape(buffer_src[:, i], sz_src)
+        TO.tensoradd!(bufblock_src, subblock_src, ptriv, false, One(), Zero(), backend, allocator)
     end
 
     # Resummation into a second buffer using BLAS
     buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
-    mul!(buffer_dst, buffer_src, transpose(StridedView(U)), α, Zero())
+    mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
 
     # Filling up the output
     for (i, struct_dst) in enumerate(structs_dst)
-        subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...)
+        subblock_dst = StridedView(data_dst, sz_dst, struct_dst...)
         bufblock_dst = sreshape(buffer_dst[:, i], sz_src)
-        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator)
+        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, α, β, backend, allocator)
     end
 
     return nothing
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index 8c63b00a8..82032b067 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -141,6 +141,13 @@ function allocate_buffers(
     sz = buffersize(transformer)
     return similar(tdst.data, sz), similar(tsrc.data, sz)
 end
+function allocate_buffers( # variant taking the raw data vectors instead of tensors
+        data_dst::DenseVector, data_src::DenseVector, transformer::GenericTreeTransformer,
+        allocator=TO.DefaultAllocator() # NOTE(review): accepted but not used in this method
+    )
+    sz = buffersize(transformer) # buffer length is determined by the transformer
+    return similar(data_dst, sz), similar(data_src, sz) # two buffers of length sz, one `similar` to each data vector
+end
 function allocate_buffers(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer,
         allocator=TO.DefaultAllocator()

From 687858a31269b95ad45891c17523cbc421acb3ce Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Fri, 24 Apr 2026 20:05:58 -0400
Subject: [PATCH 07/33] inline transform helpers into add_transform_kernel!

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/tensors/indexmanipulations.jl | 334 ++++++------------------------
 1 file changed, 65 insertions(+), 269 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index bb46b1690..1909e5d1f 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -551,47 +551,20 @@ end
     else
         I = sectortype(tdst)
         if I === Trivial
-            add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator)
+            TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator)
         else
-            style = FusionStyle(I)
-            if use_threaded_transform(tdst, transformer)
-                add_kernel_threaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator)
+            ntasks = use_threaded_transform(tdst, transformer) ? get_num_transformer_threads() : 1
+            scheduler = ntasks == 1 ? SerialScheduler() : DynamicScheduler(; ntasks, split = :roundrobin)
+            if tdst isa TensorMap && tsrc isa TensorMap # unpack data fields to avoid specializing
+                add_transform_kernel!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator, scheduler)
             else
-                add_kernel_nonthreaded!(style, tdst, tsrc, p, transformer, α, β, backend, allocator)
+                add_transform_kernel!(tdst, tsrc, p, transformer, α, β, backend, allocator, scheduler)
             end
         end
     end
 
     return tdst
 end
-@propagate_inbounds function add_transform!(
-        tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, transformer::TreeTransformer,
-        α::Number, β::Number, backend, allocator
-    )
-    @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
-    if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc)
-        add!(tdst, tsrc, α, β)
-        return tdst
-    end
-    if use_threaded_transform(tdst, transformer)
-        add_kernel_threaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator)
-    else
-        add_kernel_nonthreaded!(tdst.data, tsrc.data, p, transformer, α, β, backend, allocator)
-    end
-    return tdst
-end
-@propagate_inbounds function add_transform!(
-        tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, ::TrivialTreeTransformer,
-        α::Number, β::Number, backend, allocator
-    )
-    @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
-    if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc)
-        add!(tdst, tsrc, α, β)
-        return tdst
-    end
-    TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend, allocator)
-    return tdst
-end
 
 function use_threaded_transform(t::TensorMap, transformer)
     return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH
@@ -600,257 +573,80 @@ function use_threaded_transform(t::AbstractTensorMap, transformer)
     return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH
 end
 
-# Non-threaded implementations
-# ----------------------------
-function add_kernel_nonthreaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator
+function add_transform_kernel!( # generic fallback: addresses sub-blocks as tdst[f₁, f₂] / tsrc[f₁, f₂]
+        tdst, tsrc, p, transformer, α, β, backend, allocator, scheduler
     )
-    for (f₁, f₂) in fusiontrees(tsrc)
-        _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator)
-    end
-    return nothing
-end
-function add_kernel_nonthreaded!(
-        data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
-        α, β, backend, allocator
-    )
-    for subtransformer in transformer.data
-        _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
-    end
-    return nothing
-end
-function add_kernel_nonthreaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator
-    )
-    # preallocate buffers
-    buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
-
-    for src in fusionblocks(tsrc)
-        if length(src) == 1
-            _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator)
-        else
-            _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator)
+    I = sectortype(tdst)
+    if FusionStyle(I) === UniqueFusion()
+        tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂)
+            (f₁′, f₂′), coeff = transformer((f₁, f₂)) # one destination tree pair plus a scalar coefficient
+            @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
         end
-    end
-    return nothing
-end
-# specialization in the case of TensorMap
-function add_kernel_nonthreaded!(
-        data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
-        α, β, backend, allocator
-    )
-    buffers = allocate_buffers(data_dst, data_src, transformer, allocator)
-
-    for subtransformer in transformer.data
-        # Special case without intermediate buffers whenever there is only a single block
-        if length(subtransformer[1]) == 1
-            _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
-        else
-            _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator)
-        end
-    end
-    return nothing
-end
-# Threaded implementations
-# ------------------------
-function add_kernel_threaded!(
-        ::UniqueFusion, tdst, tsrc, p, transformer, α, β, backend, allocator;
-        ntasks::Int = get_num_transformer_threads()
-    )
-    trees = fusiontrees(tsrc)
-    nblocks = length(trees)
-    counter = Threads.Atomic{Int}(1)
-    Threads.@sync for _ in 1:min(ntasks, nblocks)
-        Threads.@spawn begin
-            while true
-                local_counter = Threads.atomic_add!(counter, 1)
-                local_counter > nblocks && break
-                @inbounds (f₁, f₂) = trees[local_counter]
-                _add_transform_single!(tdst, tsrc, p, (f₁, f₂), transformer, α, β, backend, allocator)
-            end
-        end
-    end
-    return nothing
-end
-function add_kernel_threaded!(
-        data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
-        α, β, backend, allocator; ntasks::Int = get_num_transformer_threads()
-    )
-    nblocks = length(transformer.data)
-    counter = Threads.Atomic{Int}(1)
-    Threads.@sync for _ in 1:min(ntasks, nblocks)
-        Threads.@spawn begin
-            while true
-                local_counter = Threads.atomic_add!(counter, 1)
-                local_counter > nblocks && break
-                @inbounds subtransformer = transformer.data[local_counter]
-                _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
+    else # otherwise iterate over fusion blocks; the transformer returns a matrix U per block
+        tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(tdst, tsrc, transformer, allocator))
+        tforeach(fusionblocks(tsrc); scheduler) do src
+            dst, U = transformer(src)
+            if length(src) == 1 # single tree pair: no buffers needed, scale by only(U)
+                (f₁, f₂) = only(fusiontrees(src))
+                (f₁′, f₂′) = only(fusiontrees(dst))
+                @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator)
+            else
+                buffer1, buffer2 = tl_buffers[] # scratch buffers from the TaskLocalValue (presumably one pair per task)
+                rows, cols = size(U)
+                sz_src = size(tsrc[first(fusiontrees(src))...]) # size of the first sub-block, reused for every column below
+                blocksize = prod(sz_src)
+                ptriv = (ntuple(identity, length(sz_src)), ()) # identity permutation: plain copy into the buffer
+                buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) # column i holds source sub-block i
+                for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
+                    TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], ptriv, false, One(), Zero(), backend, allocator)
+                end
+                buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) # column i will hold destination sub-block i
+                mul!(buffer_dst, buffer_src, transpose(StridedView(U))) # resummation: buffer_dst = buffer_src * transpose(U)
+                for (i, (f₃, f₄)) in enumerate(fusiontrees(dst)) # write out: permute by p; α and β are applied here
+                    TO.tensoradd!(tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator)
+                end
             end
         end
     end
     return nothing
 end
 
-function add_kernel_threaded!(
-        ::FusionStyle, tdst, tsrc, p, transformer, α, β, backend, allocator;
-        ntasks::Int = get_num_transformer_threads()
+# specialization for TensorMap: works directly on the data vectors with an AbelianTreeTransformer
+function add_transform_kernel!(
+        data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
+        α, β, backend, allocator, scheduler
     )
-    allblocks = fusionblocks(tsrc)
-    nblocks = length(allblocks)
-
-    counter = Threads.Atomic{Int}(1)
-    Threads.@sync for _ in 1:min(ntasks, nblocks)
-        Threads.@spawn begin
-            # preallocate buffers for each task
-            buffers = allocate_buffers(tdst, tsrc, transformer, allocator)
-
-            while true
-                local_counter = Threads.atomic_add!(counter, 1)
-                local_counter > nblocks && break
-                @inbounds src = allblocks[local_counter]
-                if length(src) == 1
-                    _add_transform_single!(tdst, tsrc, p, src, transformer, α, β, backend, allocator)
-                else
-                    _add_transform_multi!(tdst, tsrc, p, src, transformer, buffers, α, β, backend, allocator)
-                end
-            end
-        end
+    tforeach(transformer.data; scheduler) do (coeff, struct_dst, struct_src) # one scalar coeff and two view layouts per entry
+        TO.tensoradd!(StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...), p, false, α * coeff, β, backend, allocator)
     end
-
     return nothing
 end
-# specialization in the case of TensorMap
-function add_kernel_threaded!(
+function add_transform_kernel!( # TensorMap data vectors combined with a GenericTreeTransformer
         data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
-        α, β, backend, allocator; ntasks::Int = get_num_transformer_threads()
-    )
-    nblocks = length(transformer.data)
-
-    counter = Threads.Atomic{Int}(1)
-    Threads.@sync for _ in 1:min(ntasks, nblocks)
-        Threads.@spawn begin
-            # preallocate buffers for each task
-            buffers = allocate_buffers(data_dst, data_src, transformer, allocator)
-
-            while true
-                local_counter = Threads.atomic_add!(counter, 1)
-                local_counter > nblocks && break
-                @inbounds subtransformer = transformer.data[local_counter]
-                if length(subtransformer[1]) == 1
-                    _add_transform_single!(data_dst, data_src, p, subtransformer, α, β, backend, allocator)
-                else
-                    _add_transform_multi!(data_dst, data_src, p, subtransformer, buffers, α, β, backend, allocator)
-                end
+        α, β, backend, allocator, scheduler
+    )
+    tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(data_dst, data_src, transformer, allocator))
+    tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src))
+        if length(U) == 1 # single entry: no buffers needed, scale by only(U)
+            coeff = only(U)
+            TO.tensoradd!(StridedView(data_dst, sz_dst, only(structs_dst)...),
+                          StridedView(data_src, sz_src, only(structs_src)...),
+                          p, false, α * coeff, β, backend, allocator)
+        else
+            buffer1, buffer2 = tl_buffers[] # scratch buffers from the TaskLocalValue (presumably one pair per task)
+            rows, cols = size(U)
+            blocksize = prod(sz_src)
+            ptriv = (ntuple(identity, length(sz_src)), ()) # identity permutation: plain copy into the buffer
+            buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) # column i holds source sub-block i
+            for (i, struct_src_i) in enumerate(structs_src)
+                TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), ptriv, false, One(), Zero(), backend, allocator)
+            end
+            buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) # column i will hold destination sub-block i
+            mul!(buffer_dst, buffer_src, transpose(StridedView(U))) # resummation: buffer_dst = buffer_src * transpose(U)
+            for (i, struct_dst_i) in enumerate(structs_dst) # write out: permute by p; α and β are applied here
+                TO.tensoradd!(StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator)
             end
         end
     end
-
-    return nothing
-end
-
-
-# Auxiliary methods
-# -----------------
-function _add_transform_single!(
-        tdst, tsrc, p, (f₁, f₂)::FusionTreePair, transformer, α, β, backend, allocator
-    )
-    (f₁′, f₂′), coeff = transformer((f₁, f₂))
-    @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
-    return nothing
-end
-function _add_transform_single!(
-        tdst, tsrc, p, src::FusionTreeBlock, transformer, α, β, backend, allocator
-    )
-    dst, U = transformer(src)
-    f₁, f₂ = only(fusiontrees(src))
-    f₁′, f₂′ = only(fusiontrees(dst))
-    coeff = only(U)
-    @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
-    return nothing
-end
-function _add_transform_single!(
-        data_dst::DenseVector, data_src::DenseVector, p,
-        (coeff, struct_dst, struct_src)::AbelianTransformerData,
-        α, β, backend, allocator
-    )
-    subblock_dst = StridedView(data_dst, struct_dst...)
-    subblock_src = StridedView(data_src, struct_src...)
-    TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend, allocator)
-    return nothing
-end
-function _add_transform_single!(
-        data_dst::DenseVector, data_src::DenseVector, p,
-        (basistransform, structs_dst, structs_src)::GenericTransformerData,
-        α, β, backend, allocator
-    )
-    struct_dst = (structs_dst[1], only(structs_dst[2])...)
-    struct_src = (structs_src[1], only(structs_src[2])...)
-    coeff = only(basistransform)
-    _add_transform_single!(data_dst, data_src, p, (coeff, struct_dst, struct_src), α, β, backend, allocator)
-    return nothing
-end
-
-function _add_transform_multi!(
-        tdst, tsrc, p, src::FusionTreeBlock, transformer, (buffer1, buffer2), α, β, backend,
-        allocator
-    )
-    dst, U = transformer(src)
-    rows, cols = size(U)
-    sz_src = size(tsrc[first(fusiontrees(src))...])
-    blocksize = prod(sz_src)
-    matsize = (
-        prod(TupleTools.getindices(sz_src, codomainind(tsrc))),
-        prod(TupleTools.getindices(sz_src, domainind(tsrc))),
-    )
-
-    # Filling up a buffer with contiguous data
-    buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
-    for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
-        subblock_src = sreshape(tsrc[f₁, f₂], matsize)
-        bufblock_src = sreshape(buffer_src[:, i], matsize)
-        copy!(bufblock_src, subblock_src)
-    end
-
-    # Resummation into a second buffer using BLAS
-    buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
-    mul!(buffer_dst, buffer_src, transpose(StridedView(U)), α, Zero())
-
-    # Filling up the output
-    for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
-        subblock_dst = tdst[f₃, f₄]
-        bufblock_dst = sreshape(buffer_dst[:, i], sz_src)
-        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend, allocator)
-    end
-
-    return nothing
-end
-function _add_transform_multi!(
-        data_dst::DenseVector, data_src::DenseVector, p,
-        (U, (sz_dst, structs_dst), (sz_src, structs_src))::GenericTransformerData,
-        (buffer1, buffer2), α, β, backend, allocator
-    )
-    rows, cols = size(U)
-    blocksize = prod(sz_src)
-
-    # Filling up a buffer with contiguous data
-    buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
-    ptriv = (ntuple(identity, length(sz_src)), ())
-    for (i, struct_src) in enumerate(structs_src)
-        subblock_src = StridedView(data_src, sz_src, struct_src...)
-        bufblock_src = sreshape(buffer_src[:, i], sz_src)
-        TO.tensoradd!(bufblock_src, subblock_src, ptriv, false, One(), Zero(), backend, allocator)
-    end
-
-    # Resummation into a second buffer using BLAS
-    buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
-    mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
-
-    # Filling up the output
-    for (i, struct_dst) in enumerate(structs_dst)
-        subblock_dst = StridedView(data_dst, sz_dst, struct_dst...)
-        bufblock_dst = sreshape(buffer_dst[:, i], sz_src)
-        TO.tensoradd!(subblock_dst, bufblock_dst, p, false, α, β, backend, allocator)
-    end
-
     return nothing
 end

From e7c774de3dea98cc85deb6e1018516ad427fd607 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 21:15:23 -0400
Subject: [PATCH 08/33] make deprecations simpler

---
 src/tensors/indexmanipulations.jl | 61 +++++++++----------------------
 1 file changed, 17 insertions(+), 44 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 1909e5d1f..91ded3140 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -498,47 +498,18 @@ end
 
 # Deprecated add_*! wrappers
 # --------------------------
-"""
-    add_permute!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend])
-
-!!! warning "Deprecated"
-    `add_permute!` is deprecated. Use `permute!(tdst, tsrc, p, α, β[, backend])` instead.
-"""
-function add_permute!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number, backend::AbstractBackend...
-    )
-    Base.depwarn("`add_permute!` is deprecated, use `permute!` instead", :add_permute!)
-    return @inbounds permute!(tdst, tsrc, p, α, β, backend...)
-end
-
-"""
-    add_braid!(tdst, tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple, α::Number, β::Number[, backend])
-
-!!! warning "Deprecated"
-    `add_braid!` is deprecated. Use `braid!(tdst, tsrc, p, levels, α, β[, backend])` instead.
-"""
-function add_braid!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple,
-        α::Number, β::Number, backend::AbstractBackend...
-    )
-    Base.depwarn("`add_braid!` is deprecated, use `braid!` instead", :add_braid!)
-    return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend...)
-end
-
-"""
-    add_transpose!(tdst, tsrc, (p₁, p₂)::Index2Tuple, α::Number, β::Number[, backend])
-
-!!! warning "Deprecated"
-    `add_transpose!` is deprecated. Use `transpose!(tdst, tsrc, p, α, β[, backend])` instead.
-"""
-function add_transpose!(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple,
-        α::Number, β::Number, backend::AbstractBackend...
-    )
-    Base.depwarn("`add_transpose!` is deprecated, use `transpose!` instead", :add_transpose!)
-    return @inbounds transpose!(tdst, tsrc, p, α, β, backend...)
-end
+Base.@deprecate(
+    add_permute!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...),
+    permute!(tdst, tsrc, p, α, β, backend...)
+)
+Base.@deprecate(
+    add_braid!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, levels::IndexTuple, α::Number, β::Number, backend::AbstractBackend...),
+    braid!(tdst, tsrc, p, levels, α, β, backend...)
+)
+Base.@deprecate(
+    add_transpose!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, α::Number, β::Number, backend::AbstractBackend...),
+    transpose!(tdst, tsrc, p, α, β, backend...)
+)
 
 @propagate_inbounds function add_transform!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer,
@@ -629,9 +600,11 @@ function add_transform_kernel!(
     tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src))
         if length(U) == 1
             coeff = only(U)
-            TO.tensoradd!(StridedView(data_dst, sz_dst, only(structs_dst)...),
-                          StridedView(data_src, sz_src, only(structs_src)...),
-                          p, false, α * coeff, β, backend, allocator)
+            TO.tensoradd!(
+                StridedView(data_dst, sz_dst, only(structs_dst)...),
+                StridedView(data_src, sz_src, only(structs_src)...),
+                p, false, α * coeff, β, backend, allocator
+            )
         else
             buffer1, buffer2 = tl_buffers[]
             rows, cols = size(U)

From 3d90e3ef6deaad2a5144677c9f7a7e37827ccec6 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 21:25:24 -0400
Subject: [PATCH 09/33] refactor and explain

---
 src/tensors/indexmanipulations.jl | 85 ++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 12 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 91ded3140..4170f5fab 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -515,8 +515,14 @@ Base.@deprecate(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer,
         α::Number, β::Number, backend, allocator
     )
+    # `permute` is used as a stand-in for all index rearrangements here: permute, braid, and
+    # transpose all produce the same destination space for a given permutation tuple `p`.
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
 
+    # Three cases, from cheapest to most expensive:
+    #   1. trivial permutation: delegate to `add!` which handles α/β scaling directly
+    #   2. Trivial sector type: no fusion tree bookkeeping, call tensoradd! on the raw array
+    #   3. general case: iterate over (blocks of) fusion trees, potentially multi-threaded
     if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc)
         add!(tdst, tsrc, α, β)
     else
@@ -549,32 +555,57 @@ function add_transform_kernel!(
     )
     I = sectortype(tdst)
     if FusionStyle(I) === UniqueFusion()
+        # Abelian / unique-fusion: each source fusion tree pair (f₁, f₂) maps to exactly
+        # one destination pair (f₁′, f₂′) with a scalar coefficient. No mixing occurs.
         tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂)
             (f₁′, f₂′), coeff = transformer((f₁, f₂))
-            @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator)
+            @inbounds TO.tensoradd!(
+                tdst[f₁′, f₂′], tsrc[f₁, f₂],
+                p, false, α * coeff, β, backend, allocator
+            )
         end
     else
+        # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors
+        # form a *fusion block* and mix under the transformation via a recoupling matrix U
+        # (rows = destination trees, columns = source trees). We iterate over blocks.
         tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(tdst, tsrc, transformer, allocator))
         tforeach(fusionblocks(tsrc); scheduler) do src
             dst, U = transformer(src)
             if length(src) == 1
+                # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul.
                 (f₁, f₂) = only(fusiontrees(src))
                 (f₁′, f₂′) = only(fusiontrees(dst))
-                @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator)
+                @inbounds TO.tensoradd!(
+                    tdst[f₁′, f₂′], tsrc[f₁, f₂],
+                    p, false, α * only(U), β, backend, allocator
+                )
             else
+                # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack.
+                #   1. Extract: flatten each source block into a column of buffer_src
+                #      (shape blocksize × cols), using a trivial permutation so that the
+                #      index layout is canonical before the matmul.
+                #   2. Recoupling: buffer_dst = buffer_src * U^T  (blocksize × rows)
+                #   3. Insert: scatter columns of buffer_dst to destination blocks,
+                #      applying the actual permutation p in the same step.
                 buffer1, buffer2 = tl_buffers[]
                 rows, cols = size(U)
                 sz_src = size(tsrc[first(fusiontrees(src))...])
                 blocksize = prod(sz_src)
                 ptriv = (ntuple(identity, length(sz_src)), ())
                 buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
-                for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
-                    TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂], ptriv, false, One(), Zero(), backend, allocator)
+                @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
+                    TO.tensoradd!(
+                        sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂],
+                        ptriv, false, One(), Zero(), backend, allocator
+                    )
                 end
                 buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
                 mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
-                for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
-                    TO.tensoradd!(tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator)
+                @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
+                    TO.tensoradd!(
+                        tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
+                        p, false, α, β, backend, allocator
+                    )
                 end
             end
         end
@@ -582,13 +613,21 @@ function add_transform_kernel!(
     return nothing
 end
 
-# specialization in the case of TensorMap
+# TensorMap specializations: operate directly on the flat data vector to avoid
+# repeated dictionary lookups into t.data. The transformer has precomputed all
+# StridedView descriptors (size, offset, strides) for each fusion tree block.
+# No symmetry (sector) types remain in these signatures, so no per-sector re-specialization is needed.
 function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
         α, β, backend, allocator, scheduler
     )
+    # Each entry is (coeff, struct_dst, struct_src) where struct_{dst,src} = (size, offset, strides)
+    # locating the block for one fusion tree pair inside the flat data vector.
     tforeach(transformer.data; scheduler) do (coeff, struct_dst, struct_src)
-        TO.tensoradd!(StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...), p, false, α * coeff, β, backend, allocator)
+        TO.tensoradd!(
+            StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...),
+            p, false, α * coeff, β, backend, allocator
+        )
     end
     return nothing
 end
@@ -596,9 +635,14 @@ function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
         α, β, backend, allocator, scheduler
     )
+    # Each entry covers one fusion block:
+    #   U            — recoupling matrix (rows = dst trees, cols = src trees)
+    #   sz_{dst,src} — array shape of each block (same for all trees in the block)
+    #   structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i
     tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(data_dst, data_src, transformer, allocator))
     tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src))
         if length(U) == 1
+            # Degenerate block with a single tree: no matmul needed.
             coeff = only(U)
             TO.tensoradd!(
                 StridedView(data_dst, sz_dst, only(structs_dst)...),
@@ -606,18 +650,35 @@ function add_transform_kernel!(
                 p, false, α * coeff, β, backend, allocator
             )
         else
+            # Multi-tree block: pack → recoupling matmul → unpack.
+            # buffer2 = source staging area, buffer1 = destination staging area.
             buffer1, buffer2 = tl_buffers[]
             rows, cols = size(U)
             blocksize = prod(sz_src)
             ptriv = (ntuple(identity, length(sz_src)), ())
+
+            # 1. Extract: copy each source block into column i of buffer_src as a flat vector,
+            #    using a trivial permutation so the layout is canonical before the matmul.
             buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
-            for (i, struct_src_i) in enumerate(structs_src)
-                TO.tensoradd!(sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...), ptriv, false, One(), Zero(), backend, allocator)
+            @inbounds for (i, struct_src_i) in enumerate(structs_src)
+                TO.tensoradd!(
+                    sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...),
+                    ptriv, false, One(), Zero(), backend, allocator
+                )
             end
+
+            # 2. Recoupling: buffer_dst = buffer_src * U^T  (each output tree is a linear
+            #    combination of input trees weighted by the recoupling coefficients).
             buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
             mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
-            for (i, struct_dst_i) in enumerate(structs_dst)
-                TO.tensoradd!(StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src), p, false, α, β, backend, allocator)
+
+            # 3. Insert: scatter column i of buffer_dst into the destination, applying the
+            #    actual index permutation p in the same tensoradd! call.
+            @inbounds for (i, struct_dst_i) in enumerate(structs_dst)
+                TO.tensoradd!(
+                    StridedView(data_dst, sz_dst, struct_dst_i...), sreshape(buffer_dst[:, i], sz_src),
+                    p, false, α, β, backend, allocator
+                )
             end
         end
     end

From a743f61e35fccc623a8b2ba93c5115570044003e Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 21:37:03 -0400
Subject: [PATCH 10/33] use updated scalar types

---
 src/tensors/indexmanipulations.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 4170f5fab..10bc331e8 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -15,11 +15,8 @@ for (operation, manipulation) in (
         $promote_op(::Type{T}) where {T <: AbstractTensorMap} =
             $promote_op(scalartype(T), sectortype(T))
         $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} =
-            sectorscalartype(I) <: Integer ? T :
-            sectorscalartype(I) <: Real ? float(T) : complex(T)
-        # TODO: currently the manipulations all use sectorscalartype, change to:
-        # $manipulation_scalartype(I) <: Integer ? T :
-        # $manipulation_scalartype(I) <: Real ? float(T) : complex(T)
+            $manipulation_scalartype(I) <: Integer ? T :
+            $manipulation_scalartype(I) <: Real ? float(T) : complex(T)
     end
 end
 
@@ -369,7 +366,10 @@ If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherw
 See [`twist!`](@ref) for storing the result in place.
 """
 function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false)
-    !copy && has_shared_twist(t, inds) && return t
+    if has_shared_twist(t, inds)
+        copy || return t
+        return copy!(similar(t), t)
+    end
     tdst = similar(t, promote_twist(t))
     copy!(tdst, t)
     return twist!(tdst, inds; inv)

From 53e64e98e150a54db3118c65ec095a384636fc4b Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 21:56:40 -0400
Subject: [PATCH 11/33] reorganization

---
 src/tensors/indexmanipulations.jl | 374 ++++++++++++++++--------------
 1 file changed, 194 insertions(+), 180 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 10bc331e8..bfeeb6f82 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -1,25 +1,10 @@
-# Index manipulations
-#---------------------
-
-# find the scalartype after applying operations: take into account fusion and/or braiding
-# might need to become Float or Complex to capture complex recoupling coefficients but don't alter precision
-for (operation, manipulation) in (
-        :flip => :sector, :twist => :braiding,
-        :transpose => :fusion, :permute => :sector, :braid => :sector,
-    )
-    promote_op = Symbol(:promote_, operation)
-    manipulation_scalartype = Symbol(manipulation, :scalartype)
-
-    @eval begin
-        $promote_op(t::AbstractTensorMap) = $promote_op(typeof(t))
-        $promote_op(::Type{T}) where {T <: AbstractTensorMap} =
-            $promote_op(scalartype(T), sectortype(T))
-        $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} =
-            $manipulation_scalartype(I) <: Integer ? T :
-            $manipulation_scalartype(I) <: Real ? float(T) : complex(T)
-    end
-end
+# =============
+#  Reweighting
+# =============
 
+# ------
+# flip
+# ------
 """
     flip(t::AbstractTensorMap, I) -> t′::AbstractTensorMap
 
@@ -43,6 +28,172 @@ function flip(t::AbstractTensorMap, I; inv::Bool = false)
     return t′
 end
 
+# ---------
+# twist(!)
+# ---------
+function has_shared_twist(t, inds)
+    I = sectortype(t)
+    if BraidingStyle(I) == NoBraiding()
+        for i in inds
+            cs = sectors(space(t, i))
+            all(isunit, cs) || throw(SectorMismatch(lazy"Cannot twist sectors $cs"))
+        end
+        return true
+    elseif BraidingStyle(I) == Bosonic()
+        return true
+    else
+        for i in inds
+            cs = sectors(space(t, i))
+            all(isone ∘ twist, cs) || return false
+        end
+        return true
+    end
+end
+
+"""
+    twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t
+    twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t
+
+Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`.
+If `inv=true`, use the inverse twist.
+
+See [`twist`](@ref) for creating a new tensor.
+"""
+function twist!(t::AbstractTensorMap, inds; inv::Bool = false)
+    if !all(in(allind(t)), inds)
+        msg = "Can't twist indices $inds of a tensor with only $(numind(t)) indices."
+        throw(ArgumentError(msg))
+    end
+    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
+        throw(ArgumentError("Can't in-place twist a real tensor with complex sector type"))
+    has_shared_twist(t, inds) && return t
+
+    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
+        throw(ArgumentError("No in-place `twist!` for a real tensor with complex sector type"))
+
+    N₁ = numout(t)
+    for (f₁, f₂) in fusiontrees(t)
+        θ = prod(i -> i <= N₁ ? twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds)
+        inv && (θ = θ')
+        scale!(t[f₁, f₂], θ)
+    end
+    return t
+end
+
+"""
+    twist(tsrc::AbstractTensorMap, i::Int; inv::Bool = false, copy::Bool = false) -> tdst
+    twist(tsrc::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) -> tdst
+
+Apply a twist to the `i`th index of `tsrc`, or to all indices in `inds`, and return the result as a new tensor.
+If `inv = true`, use the inverse twist.
+If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+
+See [`twist!`](@ref) for storing the result in place.
+"""
+function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false)
+    if has_shared_twist(t, inds)
+        copy || return t
+        return copy!(similar(t), t)
+    end
+    tdst = similar(t, promote_twist(t))
+    copy!(tdst, t)
+    return twist!(tdst, inds; inv)
+end
+
+# =========================
+#  Space insertion/removal
+# =========================
+
+# Methods which change the number of indices, implement using `Val(i)` for type inference
+"""
+    insertleftunit(tsrc::AbstractTensorMap, i=numind(tsrc) + 1;
+                   conj=false, dual=false, copy=false) -> tdst
+
+Insert a trivial vector space, isomorphic to the underlying field, at position `i`,
+which can be specified as an `Int` or as `Val(i)` for improved type stability.
+More specifically, adds a left monoidal unit or its dual.
+
+If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+
+See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}),
+[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}).
+"""
+function insertleftunit(
+        t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1);
+        copy::Bool = false, conj::Bool = false, dual::Bool = false
+    ) where {i}
+    W = insertleftunit(space(t), Val(i); conj, dual)
+    if t isa TensorMap
+        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
+    else
+        tdst = similar(t, W)
+        for (c, b) in blocks(t)
+            copy!(block(tdst, c), b)
+        end
+        return tdst
+    end
+end
+
+"""
+    insertrightunit(tsrc::AbstractTensorMap, i=numind(tsrc);
+                    conj=false, dual=false, copy=false) -> tdst
+
+Insert a trivial vector space, isomorphic to the underlying field, after position `i`,
+which can be specified as an `Int` or as `Val(i)` for improved type stability.
+More specifically, adds a right monoidal unit or its dual.
+
+If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+
+See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}),
+[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}).
+"""
+function insertrightunit(
+        t::AbstractTensorMap, ::Val{i} = Val(numind(t));
+        copy::Bool = false, conj::Bool = false, dual::Bool = false
+    ) where {i}
+    W = insertrightunit(space(t), Val(i); conj, dual)
+    if t isa TensorMap
+        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
+    else
+        tdst = similar(t, W)
+        for (c, b) in blocks(t)
+            copy!(block(tdst, c), b)
+        end
+        return tdst
+    end
+end
+
+"""
+    removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst
+
+This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i`
+can be specified as an `Int` or as `Val(i)` for improved type stability.
+For this to work, that factor has to be isomorphic to the field of scalars.
+
+If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+
+This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i})
+and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}).
+"""
+function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i}
+    W = removeunit(space(t), Val(i))
+    if t isa TensorMap
+        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
+    else
+        tdst = similar(t, W)
+        for (c, b) in blocks(t)
+            copy!(block(tdst, c), b)
+        end
+        return tdst
+    end
+end
+
+# TODO: fusion/splitting of indices
+
+# ============================
+# Index rearrangements
+# ============================
+
 # --------------
 #   permute(!)
 # --------------
@@ -61,8 +212,8 @@ See also [`permute`](@ref) for creating a new tensor.
         backend::AbstractBackend = TO.DefaultBackend(), allocator = TO.DefaultAllocator()
     )
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
-    transformer = treepermuter(tdst, tsrc, p)
-    return @inbounds add_transform!(tdst, tsrc, p, transformer, α, β, backend, allocator)
+    levels = ntuple(identity, numind(tsrc))
+    return @inbounds braid!(tdst, tsrc, p, levels, α, β, backend, allocator)
 end
 
 """
@@ -92,7 +243,8 @@ function permute(
 
     # general case
     tdst = similar(t, promote_permute(t), permute(space(t), p))
-    return @inbounds permute!(tdst, t, p, One(), Zero(), backend, allocator)
+    levels = ntuple(identity, numind(t))
+    return @inbounds braid!(tdst, t, p, levels, One(), Zero(), backend, allocator)
 end
 function permute(t::AdjointTensorMap, (p₁, p₂)::Index2Tuple; kwargs...)
     p₁′ = adjointtensorindices(t, p₂)
@@ -169,7 +321,6 @@ function braid(
     )
     length(levels) == numind(t) || throw(ArgumentError(lazy"length of levels should be $(numind(t)), got $(length(levels))"))
 
-    BraidingStyle(sectortype(t)) isa SymmetricBraiding && return permute(t, p; copy, backend, allocator)
     (!copy && p == (codomainind(t), domainind(t))) && return t
 
     # general case
@@ -305,166 +456,29 @@ See also [`repartition!`](@ref) for writing into an existing destination.
     return transpose(t, (p₁, p₂); copy, backend, allocator)
 end
 
-# Twist
-function has_shared_twist(t, inds)
-    I = sectortype(t)
-    if BraidingStyle(I) == NoBraiding()
-        for i in inds
-            cs = sectors(space(t, i))
-            all(isunit, cs) || throw(SectorMismatch(lazy"Cannot twist sectors $cs"))
-        end
-        return true
-    elseif BraidingStyle(I) == Bosonic()
-        return true
-    else
-        for i in inds
-            cs = sectors(space(t, i))
-            all(isone ∘ twist, cs) || return false
-        end
-        return true
-    end
-end
-
-"""
-    twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t
-    twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t
-
-Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`.
-If `inv=true`, use the inverse twist.
-
-See [`twist`](@ref) for creating a new tensor.
-"""
-function twist!(t::AbstractTensorMap, inds; inv::Bool = false)
-    if !all(in(allind(t)), inds)
-        msg = "Can't twist indices $inds of a tensor with only $(numind(t)) indices."
-        throw(ArgumentError(msg))
-    end
-    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
-        throw(ArgumentError("Can't in-place twist a real tensor with complex sector type"))
-    has_shared_twist(t, inds) && return t
-
-    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
-        throw(ArgumentError("No in-place `twist!` for a real tensor with complex sector type"))
-
-    N₁ = numout(t)
-    for (f₁, f₂) in fusiontrees(t)
-        θ = prod(i -> i <= N₁ ? twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds)
-        inv && (θ = θ')
-        scale!(t[f₁, f₂], θ)
-    end
-    return t
-end
-
-"""
-    twist(tsrc::AbstractTensorMap, i::Int; inv::Bool = false, copy::Bool = false) -> tdst
-    twist(tsrc::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false) -> tdst
-
-Apply a twist to the `i`th index of `tsrc` and return the result as a new tensor.
-If `inv = true`, use the inverse twist.
-If `copy = false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
-
-See [`twist!`](@ref) for storing the result in place.
-"""
-function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false)
-    if has_shared_twist(t, inds)
-        copy || return t
-        return copy!(similar(t), t)
-    end
-    tdst = similar(t, promote_twist(t))
-    copy!(tdst, t)
-    return twist!(tdst, inds; inv)
-end
-
-# Methods which change the number of indices, implement using `Val(i)` for type inference
-"""
-    insertleftunit(tsrc::AbstractTensorMap, i=numind(t) + 1;
-                   conj=false, dual=false, copy=false) -> tdst
-
-Insert a trivial vector space, isomorphic to the underlying field, at position `i`,
-which can be specified as an `Int` or as `Val(i)` for improved type stability.
-More specifically, adds a left monoidal unit or its dual.
-
-If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
-
-See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}),
-[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}).
-"""
-function insertleftunit(
-        t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1);
-        copy::Bool = false, conj::Bool = false, dual::Bool = false
-    ) where {i}
-    W = insertleftunit(space(t), Val(i); conj, dual)
-    if t isa TensorMap
-        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
-    else
-        tdst = similar(t, W)
-        for (c, b) in blocks(t)
-            copy!(block(tdst, c), b)
-        end
-        return tdst
-    end
-end
-
-"""
-    insertrightunit(tsrc::AbstractTensorMap, i=numind(t);
-                    conj=false, dual=false, copy=false) -> tdst
-
-Insert a trivial vector space, isomorphic to the underlying field, after position `i`,
-which can be specified as an `Int` or as `Val(i)` for improved type stability.
-More specifically, adds a right monoidal unit or its dual.
-
-If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
-
-See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}),
-[`removeunit`](@ref removeunit(::AbstractTensorMap, ::Val{i}) where {i}).
-"""
-function insertrightunit(
-        t::AbstractTensorMap, ::Val{i} = Val(numind(t));
-        copy::Bool = false, conj::Bool = false, dual::Bool = false
-    ) where {i}
-    W = insertrightunit(space(t), Val(i); conj, dual)
-    if t isa TensorMap
-        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
-    else
-        tdst = similar(t, W)
-        for (c, b) in blocks(t)
-            copy!(block(tdst, c), b)
-        end
-        return tdst
-    end
-end
-
-"""
-    removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst
-
-This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i`
-can be specified as an `Int` or as `Val(i)` for improved type stability.
-For this to work, that factor has to be isomorphic to the field of scalars.
+#-------------------------------------
+# Internal implementations
+#-------------------------------------
 
-If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
+# find the scalartype after applying operations: take into account fusion and/or braiding
+# might need to become Float or Complex to capture complex recoupling coefficients but don't alter precision
+for (operation, manipulation) in (
+        :flip => :sector, :twist => :braiding,
+        :transpose => :fusion, :permute => :sector, :braid => :sector,
+    )
+    promote_op = Symbol(:promote_, operation)
+    manipulation_scalartype = Symbol(manipulation, :scalartype)
 
-This operation undoes the work of [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) where {i})
-and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}).
-"""
-function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i}
-    W = removeunit(space(t), Val(i))
-    if t isa TensorMap
-        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
-    else
-        tdst = similar(t, W)
-        for (c, b) in blocks(t)
-            copy!(block(tdst, c), b)
-        end
-        return tdst
+    @eval begin
+        $promote_op(t::AbstractTensorMap) = $promote_op(typeof(t))
+        $promote_op(::Type{T}) where {T <: AbstractTensorMap} =
+            $promote_op(scalartype(T), sectortype(T))
+        $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} =
+            $manipulation_scalartype(I) <: Integer ? T :
+            $manipulation_scalartype(I) <: Real ? float(T) : complex(T)
     end
 end
 
-# Fusing and splitting
-# TODO: add functionality for easy fusing and splitting of tensor indices
-
-#-------------------------------------
-# Full implementations based on `add`
-#-------------------------------------
 spacecheck_transform(f, tdst::AbstractTensorMap, tsrc::AbstractTensorMap, args...) =
     spacecheck_transform(f, space(tdst), space(tsrc), args...)
 @noinline function spacecheck_transform(f, Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple)

From 6751f26198c4d15b2b92726e1472a1c66bf6be43 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 21:56:52 -0400
Subject: [PATCH 12/33] all permutes go through braid

---
 src/tensors/treetransformers.jl | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index 82032b067..b602e11ea 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -136,21 +136,21 @@ end
 
 function allocate_buffers(
         tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer,
-        allocator=TO.DefaultAllocator()
+        allocator = TO.DefaultAllocator()
     )
     sz = buffersize(transformer)
     return similar(tdst.data, sz), similar(tsrc.data, sz)
 end
 function allocate_buffers(
         data_dst::DenseVector, data_src::DenseVector, transformer::GenericTreeTransformer,
-        allocator=TO.DefaultAllocator()
+        allocator = TO.DefaultAllocator()
     )
     sz = buffersize(transformer)
     return similar(data_dst, sz), similar(data_src, sz)
 end
 function allocate_buffers(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer,
-        allocator=TO.DefaultAllocator()
+        allocator = TO.DefaultAllocator()
     )
     # be pessimistic and assume the worst for now
     sz = dim(space(tsrc))
@@ -194,22 +194,17 @@ end
     return TreeTransformer(fusiontreebraider, p, Vdst, Vsrc)
 end
 
-for (transform, treetransformer) in
-    ((:permute, :treepermuter), (:transpose, :treetransposer))
-    @eval begin
-        function $treetransformer(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple)
-            return fusiontreetransform(f) = $transform(f, p)
-        end
-        function $treetransformer(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple)
-            return $treetransformer(space(tdst), space(tsrc), p)
-        end
-        @cached function $treetransformer(
-                Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple
-            )::treetransformertype(Vdst, Vsrc)
-            fusiontreetransform(f) = $transform(f, p)
-            return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc)
-        end
-    end
+function treetransposer(::AbstractTensorMap, ::AbstractTensorMap, p::Index2Tuple)
+    return fusiontreetransform(f) = transpose(f, p)
+end
+function treetransposer(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple)
+    return treetransposer(space(tdst), space(tsrc), p)
+end
+@cached function treetransposer(
+        Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple
+    )::treetransformertype(Vdst, Vsrc)
+    fusiontreetransform(f) = transpose(f, p)
+    return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc)
 end
 
 # default cachestyle is GlobalLRUCache

From 99ec03edc641f9c54e7387d2a8b41760bd31691b Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 21:59:46 -0400
Subject: [PATCH 13/33] format docstrings

---
 src/tensors/indexmanipulations.jl | 36 ++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index bfeeb6f82..4a1cfc997 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -51,8 +51,8 @@ function has_shared_twist(t, inds)
 end
 
 """
-    twist!(t::AbstractTensorMap, i::Int; inv::Bool=false) -> t
-    twist!(t::AbstractTensorMap, inds; inv::Bool=false) -> t
+    twist!(t::AbstractTensorMap, i::Int; inv::Bool = false) -> t
+    twist!(t::AbstractTensorMap, inds; inv::Bool = false) -> t
 
 Apply a twist to the `i`th index of `t`, or all indices in `inds`, storing the result in `t`.
 If `inv=true`, use the inverse twist.
@@ -106,8 +106,10 @@ end
 
 # Methods which change the number of indices, implement using `Val(i)` for type inference
 """
-    insertleftunit(tsrc::AbstractTensorMap, i=numind(t) + 1;
-                   conj=false, dual=false, copy=false) -> tdst
+    insertleftunit(
+            tsrc::AbstractTensorMap, i = numind(tsrc) + 1;
+            conj = false, dual = false, copy = false
+        ) -> tdst
 
 Insert a trivial vector space, isomorphic to the underlying field, at position `i`,
 which can be specified as an `Int` or as `Val(i)` for improved type stability.
@@ -135,8 +137,10 @@ function insertleftunit(
 end
 
 """
-    insertrightunit(tsrc::AbstractTensorMap, i=numind(t);
-                    conj=false, dual=false, copy=false) -> tdst
+    insertrightunit(
+            tsrc::AbstractTensorMap, i = numind(tsrc);
+            conj = false, dual = false, copy = false
+        ) -> tdst
 
 Insert a trivial vector space, isomorphic to the underlying field, after position `i`,
 which can be specified as an `Int` or as `Val(i)` for improved type stability.
@@ -164,7 +168,7 @@ function insertrightunit(
 end
 
 """
-    removeunit(tsrc::AbstractTensorMap, i; copy=false) -> tdst
+    removeunit(tsrc::AbstractTensorMap, i; copy = false) -> tdst
 
 This removes a trivial tensor product factor at position `1 ≤ i ≤ N`, where `i`
 can be specified as an `Int` or as `Val(i)` for improved type stability.
@@ -302,8 +306,10 @@ See also [`braid`](@ref) for creating a new tensor.
 end
 
 """
-    braid(tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy=false,
-          backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap
+    braid(
+            tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy = false,
+            backend = DefaultBackend(), allocator = DefaultAllocator()
+        ) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by braiding the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -369,8 +375,10 @@ end
 end
 
 """
-    transpose(tsrc, (p₁, p₂)::Index2Tuple; copy=false,
-              backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst::TensorMap
+    transpose(
+            tsrc, (p₁, p₂)::Index2Tuple; copy = false,
+            backend = DefaultBackend(), allocator = DefaultAllocator()
+        ) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by transposing the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -432,8 +440,10 @@ See also [`repartition`](@ref) for creating a new tensor.
 end
 
 """
-    repartition(tsrc, N₁::Int, N₂::Int=numind(tsrc)-N₁; copy=false,
-                backend=DefaultBackend(), allocator=DefaultAllocator()) -> tdst
+    repartition(
+            tsrc, N₁::Int, N₂::Int = numind(tsrc) - N₁; copy = false,
+            backend = DefaultBackend(), allocator = DefaultAllocator()
+        ) -> tdst
 
 Return tensor `tdst` obtained by repartitioning the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the first `N₁` and last `N₂` spaces of `tsrc`,

From 432d8e454d6227a180fefc23bd41758439a1eaab Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 22:22:19 -0400
Subject: [PATCH 14/33] update docs

---
 docs/make.jl                        |   4 +-
 docs/src/lib/tensors.md             |   6 --
 docs/src/man/indexmanipulations.md  | 107 ++++++++++++++++++++++++++++
 docs/src/man/tensormanipulations.md | 103 +-------------------------
 4 files changed, 111 insertions(+), 109 deletions(-)
 create mode 100644 docs/src/man/indexmanipulations.md

diff --git a/docs/make.jl b/docs/make.jl
index 34b025580..5517755ab 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -9,7 +9,7 @@ end
 using Documenter
 using Random
 using TensorKit
-using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple
+using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple, IndexTuple
 using TensorKit.TensorKitSectors
 using TensorKit.MatrixAlgebraKit
 using DocumenterInterLinks
@@ -27,7 +27,7 @@ pages = [
         "man/spaces.md", "man/symmetries.md",
         "man/sectors.md", "man/gradedspaces.md",
         "man/fusiontrees.md", "man/tensors.md",
-        "man/tensormanipulations.md",
+        "man/indexmanipulations.md", "man/tensormanipulations.md",
     ],
     "Library" => [
         "lib/sectors.md", "lib/fusiontrees.md",
diff --git a/docs/src/lib/tensors.md b/docs/src/lib/tensors.md
index b19537e3f..22e4c2c1b 100644
--- a/docs/src/lib/tensors.md
+++ b/docs/src/lib/tensors.md
@@ -184,12 +184,6 @@ repartition!
 twist!
 ```
 
-```@docs
-TensorKit.add_permute!
-TensorKit.add_braid!
-TensorKit.add_transpose!
-```
-
 ### Tensor map composition, traces, contractions and tensor products
 
 ```@docs
diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md
new file mode 100644
index 000000000..15908d02f
--- /dev/null
+++ b/docs/src/man/indexmanipulations.md
@@ -0,0 +1,107 @@
+# [Index manipulations](@id s_indexmanipulations)
+
+```@setup indexmanip
+using TensorKit
+using LinearAlgebra
+```
+
+Tensor maps have a bipartition of their indices into a codomain and a domain.
+Index manipulations are operations that reorganize this structure: reordering indices, moving them between domain and codomain, flipping arrows, applying twists, or inserting and removing trivial factors.
+
+Throughout this page, index positions are specified using `Index2Tuple{N₁,N₂}`, i.e. a pair `(p₁, p₂)` of tuples.
+The indices in `p₁` form the new codomain, and those in `p₂` form the new domain.
+The helper functions [`codomainind`](@ref), [`domainind`](@ref), [`allind`](@ref), [`numout`](@ref) and [`numin`](@ref) are available to retrieve the current index structure of a tensor.
+
+## Permuting and braiding
+
+For sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), use [`permute`](@ref):
+
+```@docs; canonical=false
+permute(::AbstractTensorMap, ::Index2Tuple)
+permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple)
+```
+
+For general braiding, use [`braid`](@ref), which requires an additional `levels` argument that assigns a height to each index.
+When two indices need to exchange places, the index with the higher level crosses over the index with the lower level.
+
+```@docs; canonical=false
+braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple)
+braid!
+```
+
+For plain tensors (`sectortype(t) == Trivial`), `permute` acts exactly like `permutedims` on the underlying array data:
+
+```@repl indexmanip
+V = ℂ^2;
+t = randn(V ⊗ V ← V ⊗ V);
+ta = convert(Array, t);
+t′ = permute(t, ((4, 2, 3), (1,)));
+convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1))
+```
+
+## Transposing and repartitioning
+
+[`transpose`](@ref) is a special case of braiding restricted to *cyclic permutations*, i.e. permutations where indices do not cross.
+Unlike a generic `braid`, it introduces a compensating (inverse) twist, which is necessary to satisfy the categorical definition of transpose.
+
+```@raw html
+<img src="../img/tensor-transpose.svg" alt="transpose" class="color-invertible"/>
+```
+
+```@docs; canonical=false
+transpose(::AbstractTensorMap, ::Index2Tuple)
+transpose!
+```
+
+[`repartition`](@ref) is a further special case that only changes the codomain/domain split while preserving cyclic order:
+
+```@docs; canonical=false
+repartition(::AbstractTensorMap, ::Int, ::Int)
+repartition!
+```
+
+## Flipping arrows
+
+[`flip`](@ref) applies an isomorphism to change the arrow direction on selected indices:
+
+```@docs; canonical=false
+flip(t::AbstractTensorMap, I)
+```
+
+!!! note
+    `flip` is not involutory: `flip(flip(t, I), I) ≠ t` in general.
+    Use `flip(flip(t, I), I; inv=true)` to recover the original tensor.
+
+## Twisting
+
+[`twist`](@ref) applies the monoidal twist to one or more indices.
+For `BraidingStyle(I) == Bosonic()`, all twists are trivial and `twist` returns the tensor unchanged.
+
+```@docs; canonical=false
+twist(::AbstractTensorMap, ::Int)
+twist!
+```
+
+## Inserting and removing unit spaces
+
+The following functions insert or remove a trivial tensor product factor (a space isomorphic to the scalar field) at a given position.
+Passing `Val(i)` instead of an integer `i` improves type stability.
+
+```@docs; canonical=false
+insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}
+insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}
+removeunit(::AbstractTensorMap, ::Val{i}) where {i}
+```
+
+## Fusing and splitting indices
+
+There is no dedicated function for fusing or splitting indices.
+For a plain tensor (`sectortype(t) == Trivial`), this is equivalent to `reshape` on the underlying array.
+In the general case, one can construct an explicit isomorphism using [`isomorphism`](@ref) (or [`unitary`](@ref) for Euclidean spaces) and contract it with the tensor:
+
+```julia
+u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))
+# then contract u with indices i and j of t via @tensor
+```
+
+Note that tensor factorizations (SVD, QR, etc.) can be applied directly to any index bipartition without needing to fuse indices first; see [Tensor factorizations](@ref ss_tensor_factorization).
diff --git a/docs/src/man/tensormanipulations.md b/docs/src/man/tensormanipulations.md
index 15285fd78..2f238f963 100644
--- a/docs/src/man/tensormanipulations.md
+++ b/docs/src/man/tensormanipulations.md
@@ -83,107 +83,8 @@ norm(t8) ≈ norm(t4)*norm(t6)
 
 ## [Index manipulations](@id ss_indexmanipulation)
 
-In many cases, the bipartition of tensor indices (i.e. `ElementarySpace` instances) between the codomain and domain is not fixed throughout the different operations that need to be performed on that tensor map, i.e. we want to use the duality to move spaces from domain to codomain and vice versa.
-Furthermore, we want to use the braiding to reshuffle the order of the indices.
-
-For this, we use an interface that is closely related to that for manipulating splitting- fusion tree pairs, namely [`braid`](@ref) and [`permute`](@ref), with the interface
-
-```julia
-braid(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}, levels::IndexTuple{N₁+N₂,Int})
-```
-
-and
-
-```julia
-permute(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}; copy = false)
-```
-
-both of which return an instance of `AbstractTensorMap{T, S, N₁′, N₂′}`.
-
-In these methods, `p1` and `p2` specify which of the original tensor indices ranging from `1` to `N₁ + N₂` make up the new codomain (with `N₁′` spaces) and new domain (with `N₂′` spaces).
-Hence, `(p1..., p2...)` should be a valid permutation of `1:(N₁ + N₂)`.
-Note that, throughout TensorKit.jl, permutations are always specified using tuples of `Int`s, for reasons of type stability.
-For `braid`, we also need to specify `levels` or depths for each of the indices of the original tensor, which determine whether indices will braid over or underneath each other (use the braiding or its inverse).
-We refer to the section on [manipulating fusion trees](@ref ss_fusiontrees) for more details.
-
-When `BraidingStyle(sectortype(t)) isa SymmetricBraiding`, we can use the simpler interface of `permute`, which does not require the argument `levels`.
-`permute` accepts a keyword argument `copy`.
-When `copy == true`, the result will be a tensor with newly allocated data that can independently be modified from that of the input tensor `t`.
-When `copy` takes the default value `false`, `permute` can try to return the result in a way that it shares its data with the input tensor `t`, though this is only possible in specific cases (e.g. when `sectortype(S) == Trivial` and `(p1..., p2...) = (1:(N₁+N₂)...)`).
-
-Both `braid` and `permute` come in a version where the result is stored in an already existing tensor, i.e. [`braid!(tdst, tsrc, (p1, p2), levels)`](@ref) and [`permute!(tdst, tsrc, (p1, p2))`](@ref).
-
-Another operation that belongs under index manipulations is taking the `transpose` of a tensor, i.e. `LinearAlgebra.transpose(t)` and `LinearAlgebra.transpose!(tdst, tsrc)`, both of which are reexported by TensorKit.jl.
-Note that `transpose(t)` is not simply equal to reshuffling domain and codomain with `braid(t, (1:(N₁+N₂)...), reverse(domainind(tsrc)), reverse(codomainind(tsrc))))`.
-Indeed, the graphical representation (where we draw the codomain and domain as a single object), makes clear that this introduces an additional (inverse) twist, which is then compensated in the `transpose` implementation.
-
-```@raw html
-<img src="../img/tensor-transpose.svg" alt="transpose" class="color-invertible"/>
-```
-
-In categorical language, the reason for this extra twist is that we use the left coevaluation ``η``, but the right evaluation ``\tilde{ϵ}``, when repartitioning the indices between domain and codomain.
-
-There are a number of other index related manipulations.
-We can apply a twist (or inverse twist) to one of the tensor map indices via [`twist(t, i; inv = false)`](@ref) or [`twist!(t, i; inv = false)`](@ref).
-Note that the latter method does not store the result in a new destination tensor, but just modifies the tensor `t` in place.
-Twisting several indices simultaneously can be obtained by using the defining property
-
-```math
-θ_{V⊗W} = τ_{W,V} ∘ (θ_W ⊗ θ_V) ∘ τ_{V,W} = (θ_V ⊗ θ_W) ∘ τ_{W,V} ∘ τ_{V,W},
-```
-
-but is currently not implemented explicitly.
-
-For all sector types `I` with `BraidingStyle(I) == Bosonic()`, all twists are `1` and thus have no effect.
-Let us start with some examples, in which we illustrate that, albeit `permute` might act highly non-trivial on the fusion trees and on the corresponding data, after conversion to a regular `Array` (when possible), it just acts like `permutedims`
-
-```@repl tensors
-domain(t) → codomain(t)
-ta = convert(Array, t);
-t′ = permute(t, (1, 2, 3, 4));
-domain(t′) → codomain(t′)
-convert(Array, t′) ≈ ta
-t′′ = permute(t, ((4, 2, 3), (1,)));
-domain(t′′) → codomain(t′′)
-convert(Array, t′′) ≈ permutedims(ta, (4, 2, 3, 1))
-transpose(t)
-convert(Array, transpose(t)) ≈ permutedims(ta, (4, 3, 2, 1))
-dot(t2, t) ≈ dot(transpose(t2), transpose(t))
-transpose(transpose(t)) ≈ t
-twist(t, 3) ≈ t
-```
-
-Note that `transpose` acts like one would expect on a `TensorMap{T, S, 1, 1}`.
-On a `TensorMap{T, S, N₁, N₂}`, because `transpose` replaces the codomain with the dual of the domain, which has its tensor product operation reversed, this in the end amounts in a complete reversal of all tensor indices when representing it as a plain multi-dimensional `Array`.
-Also, note that we have not defined the conjugation of `TensorMap` instances.
-One definition that one could think of is `conj(t) = adjoint(transpose(t))`.
-However note that `codomain(adjoint(tranpose(t))) == domain(transpose(t)) == dual(codomain(t))` and similarly `domain(adjoint(tranpose(t))) == dual(domain(t))`, where `dual` of a `ProductSpace` is composed of the dual of the `ElementarySpace` instances, in reverse order of tensor product.
-This might be very confusing, and as such we leave tensor conjugation undefined.
-However, note that we have a conjugation syntax within the context of [tensor contractions](@ref ss_tensor_contraction).
-
-To show the effect of `twist`, we now consider a type of sector `I` for which `BraidingStyle(I) != Bosonic()`.
-In particular, we use `FibonacciAnyon`.
-We cannot convert the resulting `TensorMap` to an `Array`, so we have to rely on indirect tests to verify our results.
-
-```@repl tensors
-V1 = GradedSpace{FibonacciAnyon}(:I => 3, :τ => 2)
-V2 = GradedSpace{FibonacciAnyon}(:I => 2, :τ => 1)
-m = randn(Float32, V1, V2)
-transpose(m)
-twist(braid(m, ((2,), (1,)), (1, 2)), 1)
-t1 = randn(V1 * V2', V2 * V1);
-t2 = randn(ComplexF64, V1 * V2', V2 * V1);
-dot(t1, t2) ≈ dot(transpose(t1), transpose(t2))
-transpose(transpose(t1)) ≈ t1
-```
-
-A final operation that one might expect in this section is to fuse or join indices, and its inverse, to split a given index into two or more indices.
-For a plain tensor (i.e. with `sectortype(t) == Trivial`) amount to the equivalent of `reshape` on the multidimensional data.
-However, this represents only one possibility, as there is no canonically unique way to embed the tensor product of two spaces `V1 ⊗ V2` in a new space `V = fuse(V1 ⊗ V2)`.
-Such a mapping can always be accompagnied by a basis transform.
-However, one particular choice is created by the function `isomorphism`, or for `EuclideanProduct` spaces, `unitary`.
-Hence, we can join or fuse two indices of a tensor by first constructing `u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))` and then contracting this map with indices `i` and `j` of `t`, as explained in the section on [contracting tensors](@ref ss_tensor_contraction).
-Note, however, that a typical algorithm is not expected to often need to fuse and split indices, as e.g. tensor factorizations can easily be applied without needing to `reshape` or fuse indices first, as explained in the next section.
+Index manipulations are operations that reorganize the bipartition of indices between the codomain and domain, possibly also reordering them or applying braiding isomorphisms.
+They are covered in detail on a dedicated page: [Index manipulations](@ref s_indexmanipulations).
 
 ## [Tensor factorizations](@id ss_tensor_factorization)
 

From 75fdfc105fbbe2d8c826e80a08cf96dfc7f200ec Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 22:38:48 -0400
Subject: [PATCH 15/33] try to improve on docs

---
 docs/src/man/indexmanipulations.md | 132 +++++++++++++++--------------
 1 file changed, 70 insertions(+), 62 deletions(-)

diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md
index 15908d02f..97de61974 100644
--- a/docs/src/man/indexmanipulations.md
+++ b/docs/src/man/indexmanipulations.md
@@ -5,103 +5,111 @@ using TensorKit
 using LinearAlgebra
 ```
 
-Tensor maps have a bipartition of their indices into a codomain and a domain.
-Index manipulations are operations that reorganize this structure: reordering indices, moving them between domain and codomain, flipping arrows, applying twists, or inserting and removing trivial factors.
+A `TensorMap{T, S, N₁, N₂}` is a linear map from a domain (a `ProductSpace{S, N₂}`) to a codomain (a `ProductSpace{S, N₁}`).
+In practice, the bipartition of the `N₁ + N₂` indices between domain and codomain is often not fixed: algorithms typically need to reshuffle indices between the two sides, reorder them, or change the arrow direction on individual indices before passing a tensor to a factorization or contraction.
 
-Throughout this page, index positions are specified using `Index2Tuple{N₁,N₂}`, i.e. a pair `(p₁, p₂)` of tuples.
-The indices in `p₁` form the new codomain, and those in `p₂` form the new domain.
-The helper functions [`codomainind`](@ref), [`domainind`](@ref), [`allind`](@ref), [`numout`](@ref) and [`numin`](@ref) are available to retrieve the current index structure of a tensor.
+Index manipulations cover all such operations.
+They act on the structure of the tensor data in a way that is fully determined by the categorical data of the `sectortype`, such that TensorKit automatically manipulates the tensor entries accordingly.
+The operations fall into three groups, which mirror the structure of the source file `src/tensors/indexmanipulations.jl`:
 
-## Permuting and braiding
+*   **Reweighting**: [`flip`](@ref) and [`twist`](@ref) apply local isomorphisms to individual indices without changing the index structure.
+*   **Space insertion/removal**: [`insertleftunit`](@ref), [`insertrightunit`](@ref) and [`removeunit`](@ref) add or remove trivial (scalar) index factors.
+*   **Index rearrangements**: [`permute`](@ref), [`braid`](@ref), [`transpose`](@ref) and [`repartition`](@ref) reorder indices and/or move them between domain and codomain.
 
-For sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), use [`permute`](@ref):
+Throughout this page, new index positions are specified using `Index2Tuple{N₁, N₂}`, i.e. a pair `(p₁, p₂)` of index tuples.
+The indices listed in `p₁` form the new codomain and those in `p₂` form the new domain.
+The following helpers retrieve the current index structure of a tensor:
 
 ```@docs; canonical=false
-permute(::AbstractTensorMap, ::Index2Tuple)
-permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple)
+numout
+numin
+numind
+codomainind
+domainind
+allind
 ```
 
-For general braiding, use [`braid`](@ref), which requires an additional `levels` argument that assigns a height to each index.
-When two indices need to exchange places, the index with the higher level crosses over the index with the lower level.
+## Reweighting
 
-```@docs; canonical=false
-braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple)
-braid!
-```
+Reweighting operations modify the entries of a tensor by applying local isomorphisms to individual indices, without changing the number of indices or their partition between domain and codomain.
 
-For plain tensors (`sectortype(t) == Trivial`), `permute` acts exactly like `permutedims` on the underlying array data:
+[`flip`](@ref) changes the arrow direction on selected indices by applying the corresponding isomorphism between a space and its dual.
+[`twist`](@ref) applies the topological spin (monoidal twist) to selected indices; for `BraidingStyle(I) == Bosonic()` this is always trivial.
 
-```@repl indexmanip
-V = ℂ^2;
-t = randn(V ⊗ V ← V ⊗ V);
-ta = convert(Array, t);
-t′ = permute(t, ((4, 2, 3), (1,)));
-convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1))
+```@docs; canonical=false
+flip(t::AbstractTensorMap, I)
+twist(::AbstractTensorMap, ::Int)
+twist!
 ```
 
-## Transposing and repartitioning
-
-[`transpose`](@ref) is a special case of braiding restricted to *cyclic permutations*, i.e. permutations where indices do not cross.
-Unlike a generic `braid`, it introduces a compensating (inverse) twist, which is necessary to satisfy the categorical definition of transpose.
+## Inserting and removing unit spaces
 
-```@raw html
-<img src="../img/tensor-transpose.svg" alt="transpose" class="color-invertible"/>
-```
+These functions add or remove a trivial tensor product factor at a specified index position, without affecting any other indices.
+[`insertleftunit`](@ref) inserts before position `i` and [`insertrightunit`](@ref) inserts after position `i`; [`removeunit`](@ref) undoes either insertion.
+Passing `Val(i)` instead of an `Int` for the position may improve type stability.
 
 ```@docs; canonical=false
-transpose(::AbstractTensorMap, ::Index2Tuple)
-transpose!
+insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}
+insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}
+removeunit(::AbstractTensorMap, ::Val{i}) where {i}
 ```
 
-[`repartition`](@ref) is a further special case that only changes the codomain/domain split while preserving cyclic order:
-
-```@docs; canonical=false
-repartition(::AbstractTensorMap, ::Int, ::Int)
-repartition!
-```
+## Index rearrangements
 
-## Flipping arrows
+These operations reorder indices and/or move them between domain and codomain by applying the transposing or braiding isomorphisms of the underlying category.
+They form a hierarchy from most general to most restricted:
 
-[`flip`](@ref) applies an isomorphism to change the arrow direction on selected indices:
+- [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass.
+- [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed.
+- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). Unlike `braid`, it introduces a compensating (inverse) twist to satisfy the categorical definition of transpose, as illustrated below:
 
-```@docs; canonical=false
-flip(t::AbstractTensorMap, I)
+```@raw html
+<img src="../img/tensor-transpose.svg" alt="transpose" class="color-invertible"/>
 ```
 
-!!! note
-    `flip` is not involutory: `flip(flip(t, I), I) ≠ t` in general.
-    Use `flip(flip(t, I), I; inv=true)` to recover the original tensor.
+- [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all.
 
-## Twisting
+For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array:
 
-[`twist`](@ref) applies the monoidal twist to one or more indices.
-For `BraidingStyle(I) == Bosonic()`, all twists are trivial and `twist` returns the tensor unchanged.
-
-```@docs; canonical=false
-twist(::AbstractTensorMap, ::Int)
-twist!
+```@repl indexmanip
+V = ℂ^2;
+t = randn(V ⊗ V ← V ⊗ V);
+ta = convert(Array, t);
+t′ = permute(t, ((4, 2, 3), (1,)));
+convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1))
 ```
 
-## Inserting and removing unit spaces
-
-The following functions insert or remove a trivial tensor product factor (a space isomorphic to the scalar field) at a given position.
-Passing `Val(i)` instead of an integer `i` improves type stability.
-
 ```@docs; canonical=false
-insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}
-insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}
-removeunit(::AbstractTensorMap, ::Val{i}) where {i}
+braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple)
+braid!
+permute(::AbstractTensorMap, ::Index2Tuple)
+permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple)
+transpose(::AbstractTensorMap, ::Index2Tuple)
+transpose!
+repartition(::AbstractTensorMap, ::Int, ::Int)
+repartition!
 ```
 
 ## Fusing and splitting indices
 
 There is no dedicated function for fusing or splitting indices.
 For a plain tensor (`sectortype(t) == Trivial`), this is equivalent to `reshape` on the underlying array.
-In the general case, one can construct an explicit isomorphism using [`isomorphism`](@ref) (or [`unitary`](@ref) for Euclidean spaces) and contract it with the tensor:
+
+In the general case there is no canonical embedding of `V1 ⊗ V2` into the fused space `V = fuse(V1 ⊗ V2)`: any two such embeddings differ by a basis transform, i.e. there is a gauge freedom.
+TensorKit resolves this by requiring the user to construct an explicit isomorphism — the *fuser* — and contract it with the tensor:
+
+```julia
+f = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))
+@tensor t_fused[…, a, …] := f[a, i, j] * t[…, i, j, …]
+```
+
+Splitting is then the adjoint of the same map:
 
 ```julia
-u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))
-# then contract u with indices i and j of t via @tensor
+@tensor t_split[…, i, j, …] := f'[i, j, a] * t_fused[…, a, …]
 ```
 
+Using `f'` as the splitter guarantees that the round-trip is the identity, i.e. `t_split ≈ t` up to floating-point rounding.
+Using a *different* isomorphism to split would give a physically equivalent but numerically different tensor, so it is important to keep `f` and its adjoint consistent throughout a calculation.
+
 Note that tensor factorizations (SVD, QR, etc.) can be applied directly to any index bipartition without needing to fuse indices first; see [Tensor factorizations](@ref ss_tensor_factorization).

From 5f742d93245d2670785981a5660d49419bbd72f9 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 22:59:34 -0400
Subject: [PATCH 16/33] rework buffer interaction

---
 src/tensors/indexmanipulations.jl | 16 ++++++++--------
 src/tensors/treetransformers.jl   | 28 ----------------------------
 2 files changed, 8 insertions(+), 36 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 4a1cfc997..a5d618872 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -592,7 +592,6 @@ function add_transform_kernel!(
         # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors
         # form a *fusion block* and mix under the transformation via a recoupling matrix U
         # (rows = destination trees, columns = source trees). We iterate over blocks.
-        tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(tdst, tsrc, transformer, allocator))
         tforeach(fusionblocks(tsrc); scheduler) do src
             dst, U = transformer(src)
             if length(src) == 1
@@ -611,19 +610,19 @@ function add_transform_kernel!(
                 #   2. Recoupling: buffer_dst = buffer_src * U^T  (blocksize × rows)
                 #   3. Insert: scatter columns of buffer_dst to destination blocks,
                 #      applying the actual permutation p in the same step.
-                buffer1, buffer2 = tl_buffers[]
                 rows, cols = size(U)
                 sz_src = size(tsrc[first(fusiontrees(src))...])
                 blocksize = prod(sz_src)
                 ptriv = (ntuple(identity, length(sz_src)), ())
-                buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
+                buffer = TO.tensoralloc(storagetype(tdst), blocksize * (rows + cols), Val(true), allocator)
+                buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
+                buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
                 @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
                     TO.tensoradd!(
                         sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂],
                         ptriv, false, One(), Zero(), backend, allocator
                     )
                 end
-                buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
                 mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
                 @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
                     TO.tensoradd!(
@@ -631,6 +630,7 @@ function add_transform_kernel!(
                         p, false, α, β, backend, allocator
                     )
                 end
+                TO.tensorfree!(buffer, allocator)
             end
         end
     end
@@ -663,7 +663,6 @@ function add_transform_kernel!(
     #   U            — recoupling matrix (rows = dst trees, cols = src trees)
     #   sz_{dst,src} — array shape of each block (same for all trees in the block)
     #   structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i
-    tl_buffers = OhMyThreads.TaskLocalValue(() -> allocate_buffers(data_dst, data_src, transformer, allocator))
     tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src))
         if length(U) == 1
             # Degenerate block with a single tree: no matmul needed.
@@ -676,14 +675,15 @@ function add_transform_kernel!(
         else
             # Multi-tree block: pack → recoupling matmul → unpack.
             # buffer2 = source staging area, buffer1 = destination staging area.
-            buffer1, buffer2 = tl_buffers[]
             rows, cols = size(U)
             blocksize = prod(sz_src)
             ptriv = (ntuple(identity, length(sz_src)), ())
+            buffer = TO.tensoralloc(typeof(data_dst), blocksize * (rows + cols), Val(true), allocator)
+            buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
+            buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
 
             # 1. Extract: copy each source block into column i of buffer_src as a flat vector,
             #    using a trivial permutation so the layout is canonical before the matmul.
-            buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0)
             @inbounds for (i, struct_src_i) in enumerate(structs_src)
                 TO.tensoradd!(
                     sreshape(buffer_src[:, i], sz_src), StridedView(data_src, sz_src, struct_src_i...),
@@ -693,7 +693,6 @@ function add_transform_kernel!(
 
             # 2. Recoupling: buffer_dst = buffer_src * U^T  (each output tree is a linear
             #    combination of input trees weighted by the recoupling coefficients).
-            buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0)
             mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
 
             # 3. Insert: scatter column i of buffer_dst into the destination, applying the
@@ -704,6 +703,7 @@ function add_transform_kernel!(
                     p, false, α, β, backend, allocator
                 )
             end
+            TO.tensorfree!(buffer, allocator)
         end
     end
     return nothing
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index b602e11ea..aaecbe746 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -128,34 +128,6 @@ function repack_transformer_structure(structures::Dictionary, trees)
     return sz, strides_offsets
 end
 
-function buffersize(transformer::GenericTreeTransformer)
-    return maximum(transformer.data; init = 0) do (basistransform, structures_dst, _)
-        return prod(structures_dst[1]) * size(basistransform, 1)
-    end
-end
-
-function allocate_buffers(
-        tdst::TensorMap, tsrc::TensorMap, transformer::GenericTreeTransformer,
-        allocator = TO.DefaultAllocator()
-    )
-    sz = buffersize(transformer)
-    return similar(tdst.data, sz), similar(tsrc.data, sz)
-end
-function allocate_buffers(
-        data_dst::DenseVector, data_src::DenseVector, transformer::GenericTreeTransformer,
-        allocator = TO.DefaultAllocator()
-    )
-    sz = buffersize(transformer)
-    return similar(data_dst, sz), similar(data_src, sz)
-end
-function allocate_buffers(
-        tdst::AbstractTensorMap, tsrc::AbstractTensorMap, transformer,
-        allocator = TO.DefaultAllocator()
-    )
-    # be pessimistic and assume the worst for now
-    sz = dim(space(tsrc))
-    return similar(storagetype(tdst), sz), similar(storagetype(tsrc), sz)
-end
 
 function treetransformertype(Vdst, Vsrc)
     I = sectortype(Vdst)

From 48631de2c634b9a7498f726e293f9f4a5aae1b67 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sat, 25 Apr 2026 23:04:36 -0400
Subject: [PATCH 17/33] insert allocator checkpoints/resets

---
 Project.toml                      | 2 +-
 src/tensors/indexmanipulations.jl | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index af43fc925..75db02f08 100644
--- a/Project.toml
+++ b/Project.toml
@@ -55,7 +55,7 @@ Random = "1"
 ScopedValues = "1.3.0"
 Strided = "2"
 TensorKitSectors = "0.3.7"
-TensorOperations = "5.1"
+TensorOperations = "5.5"
 TupleTools = "1.5"
 VectorInterface = "0.4.8, 0.5"
 cuTENSOR = "6"
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index a5d618872..c1cb47671 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -589,6 +589,7 @@ function add_transform_kernel!(
             )
         end
     else
+        cp = TO.allocator_checkpoint!(allocator)
         # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors
         # form a *fusion block* and mix under the transformation via a recoupling matrix U
         # (rows = destination trees, columns = source trees). We iterate over blocks.
@@ -633,6 +634,7 @@ function add_transform_kernel!(
                 TO.tensorfree!(buffer, allocator)
             end
         end
+        TO.allocator_reset!(allocator, cp)
     end
     return nothing
 end
@@ -663,6 +665,7 @@ function add_transform_kernel!(
     #   U            — recoupling matrix (rows = dst trees, cols = src trees)
     #   sz_{dst,src} — array shape of each block (same for all trees in the block)
     #   structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i
+    cp = TO.allocator_checkpoint!(allocator)
     tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src))
         if length(U) == 1
             # Degenerate block with a single tree: no matmul needed.
@@ -706,5 +709,6 @@ function add_transform_kernel!(
             TO.tensorfree!(buffer, allocator)
         end
     end
+    TO.allocator_reset!(allocator, cp)
     return nothing
 end

From 538c30b7cfec383b919abf01ee4f8e58af33a76d Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sun, 26 Apr 2026 08:57:29 -0400
Subject: [PATCH 18/33] attempt to improve `cond` precision in test

---
 test/factorizations/svd.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/factorizations/svd.jl b/test/factorizations/svd.jl
index 0678db827..e2eb19076 100644
--- a/test/factorizations/svd.jl
+++ b/test/factorizations/svd.jl
@@ -49,9 +49,9 @@ for V in spacelist
             end
             for T in eltypes, t in (randn(T, W, W), randn(T, W, W)')
                 project_hermitian!(t)
-                vals = @constinferred LinearAlgebra.eigvals(t)
-                λmax = maximum(s -> maximum(abs, s), values(vals))
-                λmin = minimum(s -> minimum(abs, s), values(vals))
+                vals = @constinferred eigh_vals(t)
+                λmax = maximum(abs, vals)
+                λmin = minimum(abs, vals)
                 @test cond(t) ≈ λmax / λmin
             end
         end

From 4fba1f2b8345663fa1e24a22a823b38703a25895 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sun, 26 Apr 2026 13:06:05 -0400
Subject: [PATCH 19/33] add hook for `adapt_transformer`

---
 ext/TensorKitAMDGPUExt/roctensormap.jl | 9 +++++++++
 ext/TensorKitCUDAExt/cutensormap.jl    | 9 +++++++--
 src/tensors/indexmanipulations.jl      | 1 +
 src/tensors/treetransformers.jl        | 9 +++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
index f2f094c60..c65f91062 100644
--- a/ext/TensorKitAMDGPUExt/roctensormap.jl
+++ b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -162,3 +162,12 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
+
+function TensorKit.adapt_transformer(
+        t::TensorKit.GenericTreeTransformer, data::ROCVector
+    )
+    new_data = map(t.data) do (U, structs_dst, structs_src)
+        return AMDGPU.Adapt.adapt(ROCArray, U), structs_dst, structs_src
+    end
+    return TensorKit.GenericTreeTransformer(new_data)
+end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index 02ca2d5d6..aec9b2313 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -156,6 +156,11 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
     end
 end
 
-function TensorKit._add_transform_multi!(tdst::CuTensorMap, tsrc, p, (U, structs_dst, structs_src)::Tuple{<:Array, TD, TS}, buffers, alpha, beta, backend...) where {TD, TS}
-    return TensorKit._add_transform_multi!(tdst, tsrc, p, (CUDA.CUDACore.Adapt.adapt(CuArray, U), structs_dst, structs_src), buffers, alpha, beta, backend...)
+function TensorKit.adapt_transformer(
+        t::TensorKit.GenericTreeTransformer, data::CuVector
+    )
+    new_data = map(t.data) do (U, structs_dst, structs_src)
+        return CUDA.Adapt.adapt(CuArray, U), structs_dst, structs_src
+    end
+    return TensorKit.GenericTreeTransformer(new_data)
 end
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index c1cb47671..84865d5e4 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -661,6 +661,7 @@ function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
         α, β, backend, allocator, scheduler
     )
+    transformer = adapt_transformer(transformer, data_dst)
     # Each entry covers one fusion block:
     #   U            — recoupling matrix (rows = dst trees, cols = src trees)
     #   sz_{dst,src} — array shape of each block (same for all trees in the block)
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index aaecbe746..664f8a0d6 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -203,3 +203,12 @@ end
 function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData)
     return length(mat) * prod(structs_dst[1])
 end
+
+"""
+    adapt_transformer(transformer::TreeTransformer, data::AbstractVector)
+
+Return a version of `transformer` whose internal arrays are compatible with `data`.
+Default is a no-op. Backends (e.g. CUDA, AMDGPU) should overload this for their vector types
+to ensure the recoupling matrix `U` inside `GenericTreeTransformer` is on the correct device.
+"""
+adapt_transformer(t::TreeTransformer, ::AbstractVector) = t

From c70de0c282f949a65a63e25558513d87bcec0753 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sun, 26 Apr 2026 13:55:59 -0400
Subject: [PATCH 20/33] collapse docstrings

---
 docs/src/lib/fusiontrees.md        | 1 +
 docs/src/lib/sectors.md            | 1 +
 docs/src/lib/spaces.md             | 1 +
 docs/src/lib/tensors.md            | 1 +
 docs/src/man/indexmanipulations.md | 9 ++++-----
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/src/lib/fusiontrees.md b/docs/src/lib/fusiontrees.md
index 8e037af93..57033eca6 100644
--- a/docs/src/lib/fusiontrees.md
+++ b/docs/src/lib/fusiontrees.md
@@ -2,6 +2,7 @@
 
 ```@meta
 CurrentModule = TensorKit
+CollapsedDocStrings = true
 ```
 
 # Type hierarchy
diff --git a/docs/src/lib/sectors.md b/docs/src/lib/sectors.md
index f56980bf0..8a696c675 100644
--- a/docs/src/lib/sectors.md
+++ b/docs/src/lib/sectors.md
@@ -2,6 +2,7 @@
 
 ```@meta
 CurrentModule = TensorKit
+CollapsedDocStrings = true
 ```
 
 ## Type hierarchy
diff --git a/docs/src/lib/spaces.md b/docs/src/lib/spaces.md
index e5705fe3e..a6cce06a1 100644
--- a/docs/src/lib/spaces.md
+++ b/docs/src/lib/spaces.md
@@ -2,6 +2,7 @@
 
 ```@meta
 CurrentModule = TensorKit
+CollapsedDocStrings = true
 ```
 
 ## Type hierarchy
diff --git a/docs/src/lib/tensors.md b/docs/src/lib/tensors.md
index 22e4c2c1b..1ef3da0fb 100644
--- a/docs/src/lib/tensors.md
+++ b/docs/src/lib/tensors.md
@@ -2,6 +2,7 @@
 
 ```@meta
 CurrentModule = TensorKit
+CollapsedDocStrings = true
 ```
 
 ## Type hierarchy
diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md
index 97de61974..bc5cc331e 100644
--- a/docs/src/man/indexmanipulations.md
+++ b/docs/src/man/indexmanipulations.md
@@ -1,5 +1,9 @@
 # [Index manipulations](@id s_indexmanipulations)
 
+```@meta
+CollapsedDocStrings = true
+```
+
 ```@setup indexmanip
 using TensorKit
 using LinearAlgebra
@@ -62,11 +66,6 @@ They form a hierarchy from most general to most restricted:
 - [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass.
 - [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed.
 - [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). Unlike `braid`, it introduces a compensating (inverse) twist to satisfy the categorical definition of transpose, as illustrated below:
-
-```@raw html
-<img src="../img/tensor-transpose.svg" alt="transpose" class="color-invertible"/>
-```
-
 - [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all.
 
 For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array:

From 8512ad58d943e264ef76461f4e599be56d0e0f46 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sun, 26 Apr 2026 14:18:00 -0400
Subject: [PATCH 21/33] update docs structure

---
 docs/make.jl                        |  10 +-
 docs/src/man/contractions.md        | 106 +++++++++++++
 docs/src/man/factorizations.md      |  46 ++++++
 docs/src/man/indexmanipulations.md  |   2 +-
 docs/src/man/linearalgebra.md       |  85 ++++++++++
 docs/src/man/tensormanipulations.md | 238 ----------------------------
 docs/src/man/tensors.md             |   2 +-
 7 files changed, 247 insertions(+), 242 deletions(-)
 create mode 100644 docs/src/man/contractions.md
 create mode 100644 docs/src/man/factorizations.md
 create mode 100644 docs/src/man/linearalgebra.md
 delete mode 100644 docs/src/man/tensormanipulations.md

diff --git a/docs/make.jl b/docs/make.jl
index 5517755ab..d72b851a6 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -26,8 +26,14 @@ pages = [
         "man/intro.md", "man/tutorial.md",
         "man/spaces.md", "man/symmetries.md",
         "man/sectors.md", "man/gradedspaces.md",
-        "man/fusiontrees.md", "man/tensors.md",
-        "man/indexmanipulations.md", "man/tensormanipulations.md",
+        "man/fusiontrees.md",
+        "Tensors" => [
+            "man/tensors.md",
+            "man/linearalgebra.md",
+            "man/indexmanipulations.md",
+            "man/factorizations.md",
+            "man/contractions.md",
+        ],
     ],
     "Library" => [
         "lib/sectors.md", "lib/fusiontrees.md",
diff --git a/docs/src/man/contractions.md b/docs/src/man/contractions.md
new file mode 100644
index 000000000..d70b5681d
--- /dev/null
+++ b/docs/src/man/contractions.md
@@ -0,0 +1,106 @@
+# [Tensor contractions and tensor networks](@id ss_tensor_contraction)
+
+One of the most important operations with tensor maps is to compose them, more generally known as contracting them.
+As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines emanating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist.
+Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding to objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``.
+The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed.
+Such a two-dimensional diagram cannot easily be encoded in a single line of code.
+
+However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial.
+This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint.
+The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below.
+
+In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain.
+We can then rearrange the morphism to be all lined up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``.
+This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially.
+Similarly, double crossings can also be omitted.
+As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connected, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism.
+If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices.
+We sketch such a rearrangement in the following picture
+
+```@raw html
+<img src="../img/tensor-bosoniccontraction.svg" alt="tensor bosonic contraction" class="color-invertible"/>
+```
+
+Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimics [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation convention.
+Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays.
+For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl).
+
+The above picture would be encoded as
+```julia
+@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+or
+```julia
+@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5]
+```
+where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers.
+
+A number of remarks are in order.
+TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label.
+Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`.
+Index `N₁ + 1` then corresponds to the first space in `domain(A)`.
+However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indicated by the dotted box around ``A`` in the above picture.
+The same holds for the other tensor maps.
+Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices.
+
+With the current syntax, we create a new object `E` because we use the definition operator `:=`.
+Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`.
+We can also directly define `E` with the correct codomain and domain by rather using
+```julia
+@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+or
+```julia
+@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+where the latter syntax can also be used when the codomain is empty.
+When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory.
+Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain separately using the above syntax has no effect, as the bipartition of indices is already fixed by the existing object.
+Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent
+```julia
+@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z]
+@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z]
+```
+and none of those will or can change the partition of the indices of `E` into its codomain and its domain.
+
+Two final remarks are in order.
+Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist.
+As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network.
+For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriately combining spaces into a single line, we indeed find
+
+```@raw html
+<img src="../img/tensor-contractionreorder.svg" alt="tensor contraction reorder" class="color-invertible"/>
+```
+
+or thus, the following two lines of code yield the same result
+```julia
+@tensor C[i, j] := B[i, k] * A[k, j]
+@tensor C[i, j] := A[k, j] * B[i, k]
+```
+Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner.
+In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process.
+There is also an `@ncon` macro and `ncon` function, and we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work.
+
+A final remark involves the use of adjoints of tensors.
+The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance.
+Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically.
+However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`.
+Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`.
+For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions:
+
+```julia
+@tensor C[i, j] := B'[i, k] * A[k, j]
+@tensor C[i, j] := conj(B[k, i]) * A[k, j]
+```
+
+For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`.
+
+## Fermionic tensor contractions
+
+TODO
+
+## Anyonic tensor contractions
+
+TODO
diff --git a/docs/src/man/factorizations.md b/docs/src/man/factorizations.md
new file mode 100644
index 000000000..2fbd8b382
--- /dev/null
+++ b/docs/src/man/factorizations.md
@@ -0,0 +1,46 @@
+# [Tensor factorizations](@id ss_tensor_factorization)
+
+```@setup tensors
+using TensorKit
+using LinearAlgebra
+```
+
+As tensors are linear maps, they support various kinds of factorizations.
+These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices.
+
+TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data.
+For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/).
+
+Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization.
+In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref s_indexmanipulations) before the factorization.
+
+Some examples to conclude this section
+```@repl tensors
+V1 = SU₂Space(0 => 2, 1/2 => 1)
+V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1)
+
+t = randn(V1 ⊗ V1, V2);
+U, S, Vh = svd_compact(t);
+t ≈ U * S * Vh
+D, V = eigh_full(t' * t);
+D ≈ S * S
+U' * U ≈ id(domain(U))
+S
+
+Q, R = left_orth(t; alg = :svd);
+Q' * Q ≈ id(domain(Q))
+t ≈ Q * R
+
+U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1));
+Vh2 * Vh2' ≈ id(codomain(Vh2))
+S2
+ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1)))
+
+L, Q = right_orth(permute(t, ((1,), (2, 3))));
+codomain(L), domain(L), domain(Q)
+Q * Q'
+P = Q' * Q;
+P ≈ P * P
+t′ = permute(t, ((1,), (2, 3)));
+t′ ≈ t′ * P
+```
diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md
index bc5cc331e..8379aac3d 100644
--- a/docs/src/man/indexmanipulations.md
+++ b/docs/src/man/indexmanipulations.md
@@ -65,7 +65,7 @@ They form a hierarchy from most general to most restricted:
 
 - [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass.
 - [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed.
-- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross). Unlike `braid`, it introduces a compensating (inverse) twist to satisfy the categorical definition of transpose, as illustrated below:
+- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross).
 - [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all.
 
 For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array:
diff --git a/docs/src/man/linearalgebra.md b/docs/src/man/linearalgebra.md
new file mode 100644
index 000000000..98c9f489a
--- /dev/null
+++ b/docs/src/man/linearalgebra.md
@@ -0,0 +1,85 @@
+# [Basic linear algebra](@id ss_tensor_linalg)
+
+```@setup tensors
+using TensorKit
+using LinearAlgebra
+```
+
+`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices.
+To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library.
+Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can thus be used without `using LinearAlgebra` explicitly.
+In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms.
+
+In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second.
+Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication.
+TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`.
+We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`.
+If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`.
+However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse.
+This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`.
+
+`AbstractTensorMap` instances themselves behave as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other.
+There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`.
+In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data.
+Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`.
+
+For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain.
+Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm.
+These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces.
+Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`.
+In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`.
+
+With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`.
+Note that for an instance `t::TensorMap{T, S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{T, S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`.
+This should be mostly invisible to the user, as all methods should work for this type as well.
+It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported.
+There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once.
+Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`.
+
+`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed.
+
+When tensor map instances are endomorphisms, i.e. they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`.
+The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed in the section on [tensor construction](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`.
+Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`.
+Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphisms that we discuss on the [Tensor factorizations](@ref ss_tensor_factorization) page.
+
+Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations.
+The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`.
+If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` also has the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`.
+Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogue of `vcat`.
+Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces.
+
+Time for some more examples:
+```@repl tensors
+using TensorKit # hide
+V1 = ℂ^2
+t = randn(V1 ← V1 ⊗ V1 ⊗ V1)
+t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t
+t2 = randn(ComplexF64, codomain(t), domain(t));
+dot(t2, t)
+tr(t2' * t)
+dot(t2, t) ≈ dot(t', t2')
+dot(t2, t2)
+norm(t2)^2
+t3 = copy!(similar(t, ComplexF64), t);
+t3 == t
+rmul!(t3, 0.8);
+t3 ≈ 0.8 * t
+axpby!(0.5, t2, 1.3im, t3);
+t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t
+t4 = randn(fuse(codomain(t)), codomain(t));
+t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t));
+mul!(t5, t4, t) == t4 * t
+inv(t4) * t4 ≈ id(codomain(t))
+t4 * inv(t4) ≈ id(fuse(codomain(t)))
+t4 \ (t4 * t) ≈ t
+t6 = randn(ComplexF64, V1, codomain(t));
+numout(t4) == numout(t6) == 1
+t7 = catcodomain(t4, t6);
+foreach(println, (codomain(t4), codomain(t6), codomain(t7)))
+norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2)
+t8 = t4 ⊗ t6;
+foreach(println, (codomain(t4), codomain(t6), codomain(t8)))
+foreach(println, (domain(t4), domain(t6), domain(t8)))
+norm(t8) ≈ norm(t4)*norm(t6)
+```
diff --git a/docs/src/man/tensormanipulations.md b/docs/src/man/tensormanipulations.md
deleted file mode 100644
index 2f238f963..000000000
--- a/docs/src/man/tensormanipulations.md
+++ /dev/null
@@ -1,238 +0,0 @@
-# [Manipulating tensors](@id s_tensormanipulations)
-
-## [Vector space and linear algebra operations](@id ss_tensor_linalg)
-
-`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices.
-To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library.
-Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can then us be used without `using LinearAlgebra` explicitly.
-In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms.
-
-In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second.
-Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication.
-TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`.
-We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`.
-If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`.
-However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse.
-This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`.
-
-`AbstractTensorMap` instances behave themselves as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other.
-There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`.
-In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data.
-Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`.
-
-For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain.
-Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm.
-These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces.
-Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`.
-In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`.
-
-With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`.
-Note that for an instance `t::TensorMap{S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`.
-This should be mostly invisible to the user, as all methods should work for this type as well.
-It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported.
-There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once.
-Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`.
-
-`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed.
-
-When tensor map instances are endomorphisms, i.e. they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`.
-The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`.
-Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`.
-Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphism that we discuss below.
-
-Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations.
-The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`.
-If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`.
-Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`.
-Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces.
-
-Time for some more examples:
-```@repl tensors
-using TensorKit # hide
-V1 = ℂ^2
-t = randn(V1 ← V1 ⊗ V1 ⊗ V1)
-t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t
-t2 = randn(ComplexF64, codomain(t), domain(t));
-dot(t2, t)
-tr(t2' * t)
-dot(t2, t) ≈ dot(t', t2')
-dot(t2, t2)
-norm(t2)^2
-t3 = copy!(similar(t, ComplexF64), t);
-t3 == t
-rmul!(t3, 0.8);
-t3 ≈ 0.8 * t
-axpby!(0.5, t2, 1.3im, t3);
-t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t
-t4 = randn(fuse(codomain(t)), codomain(t));
-t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t));
-mul!(t5, t4, t) == t4 * t
-inv(t4) * t4 ≈ id(codomain(t))
-t4 * inv(t4) ≈ id(fuse(codomain(t)))
-t4 \ (t4 * t) ≈ t
-t6 = randn(ComplexF64, V1, codomain(t));
-numout(t4) == numout(t6) == 1
-t7 = catcodomain(t4, t6);
-foreach(println, (codomain(t4), codomain(t6), codomain(t7)))
-norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2)
-t8 = t4 ⊗ t6;
-foreach(println, (codomain(t4), codomain(t6), codomain(t8)))
-foreach(println, (domain(t4), domain(t6), domain(t8)))
-norm(t8) ≈ norm(t4)*norm(t6)
-```
-
-## [Index manipulations](@id ss_indexmanipulation)
-
-Index manipulations are operations that reorganize the bipartition of indices between the codomain and domain, possibly also reordering them or applying braiding isomorphisms.
-They are covered in detail on a dedicated page: [Index manipulations](@ref s_indexmanipulations).
-
-## [Tensor factorizations](@id ss_tensor_factorization)
-
-As tensors are linear maps, they suport various kinds of factorizations.
-These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices.
-
-TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data.
-For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/).
-
-Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization.
-In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref ss_indexmanipulation) before the factorization.
-
-Some examples to conclude this section
-```@repl tensors
-V1 = SU₂Space(0 => 2, 1/2 => 1)
-V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1)
-
-t = randn(V1 ⊗ V1, V2);
-U, S, Vh = svd_compact(t);
-t ≈ U * S * Vh
-D, V = eigh_full(t' * t);
-D ≈ S * S
-U' * U ≈ id(domain(U))
-S
-
-Q, R = left_orth(t; alg = :svd);
-Q' * Q ≈ id(domain(Q))
-t ≈ Q * R
-
-U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1));
-Vh2 * Vh2' ≈ id(codomain(Vh2))
-S2
-ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1)))
-
-L, Q = right_orth(permute(t, ((1,), (2, 3))));
-codomain(L), domain(L), domain(Q)
-Q * Q'
-P = Q' * Q;
-P ≈ P * P
-t′ = permute(t, ((1,), (2, 3)));
-t′ ≈ t′ * P
-```
-
-## [Bosonic tensor contractions and tensor networks](@id ss_tensor_contraction)
-
-One of the most important operation with tensor maps is to compose them, more generally known as contracting them.
-As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines eminating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist.
-Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``.
-The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed.
-Such a two-dimensional diagram cannot easily be encoded in a single line of code.
-
-However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial.
-This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint.
-The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below.
-
-In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain.
-We can then rearrange the morphism to be all ligned up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``.
-This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially.
-Similarly, double crossings can also be omitted.
-As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connect, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism.
-If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices.
-We sketch such a rearrangement in the following picture
-
-```@raw html
-<img src="../img/tensor-bosoniccontraction.svg" alt="tensor unitary" class="color-invertible"/>
-```
-
-Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimicks [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation conventation.
-Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays.
-For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl).
-
-The above picture would be encoded as
-```julia
-@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-```
-or
-```julia
-@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5]
-```
-where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers.
-
-A number of remarks are in order.
-TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label.
-Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`.
-Index `N₁ + 1` then corresponds to the first space in `domain(A)`.
-However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indiated by the dotted box around ``A`` in the above picture.
-The same holds for the other tensor maps.
-Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices.
-
-With the current syntax, we create a new object `E` because we use the definition operator `:=`.
-Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`.
-We can also directly define `E` with the correct codomain and domain by rather using
-```julia
-@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-```
-or
-```julia
-@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-```
-where the latter syntax can also be used when the codomain is empty.
-When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory.
-Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain seperately using the above syntax, has no effect, as the bipartition of indices are already fixed by the existing object.
-Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent
-```julia
-@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z]
-@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z]
-```
-and none of those will or can change the partition of the indices of `E` into its codomain and its domain.
-
-Two final remarks are in order.
-Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist.
-As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network.
-For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriotely combining spaces into a single line, we indeed find
-
-```@raw html
-<img src="../img/tensor-contractionreorder.svg" alt="tensor contraction reorder" class="color-invertible"/>
-```
-
-or thus, the following two lines of code yield the same result
-```julia
-@tensor C[i, j] := B[i, k] * A[k, j]
-@tensor C[i, j] := A[k, j] * B[i, k]
-```
-Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner.
-In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process.
-There is also an `@ncon` macro and `ncon` function, an we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work.
-
-A final remark involves the use of adjoints of tensors.
-The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance.
-Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically.
-However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`.
-Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`.
-For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions:
-
-```julia
-@tensor C[i, j] := B'[i, k] * A[k, j]
-@tensor C[i, j] := conj(B[k, i]) * A[k, j]
-```
-
-For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`.
-
-Some examples:
-
-## Fermionic tensor contractions
-
-TODO
-
-## Anyonic tensor contractions
-
-TODO
diff --git a/docs/src/man/tensors.md b/docs/src/man/tensors.md
index 2921e5f1b..155dbbe4d 100644
--- a/docs/src/man/tensors.md
+++ b/docs/src/man/tensors.md
@@ -5,7 +5,7 @@ using TensorKit
 using LinearAlgebra
 ```
 
-This last page explains how to create and manipulate tensors in TensorKit.jl.
+This page explains how to construct and access tensors in TensorKit.jl.
 As this is probably the most important part of the manual, we will also focus more strongly on the usage and interface, and less so on the underlying implementation.
 The only aspect of the implementation that we will address is the storage of the tensor data, as this is important to know how to create and initialize a tensor, but will in fact also shed light on how some of the methods work.
 

From c1e910eee8c46e20ca70b3533e7656e762e8989e Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Sun, 26 Apr 2026 17:32:19 -0400
Subject: [PATCH 22/33] update `adapt_transformer`

---
 ext/TensorKitAMDGPUExt/roctensormap.jl |  9 +--------
 ext/TensorKitCUDAExt/cutensormap.jl    |  9 +--------
 src/tensors/indexmanipulations.jl      | 16 +++++++++++++---
 src/tensors/treetransformers.jl        |  9 ---------
 4 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
index c65f91062..56dea938e 100644
--- a/ext/TensorKitAMDGPUExt/roctensormap.jl
+++ b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -163,11 +163,4 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
     end
 end
 
-function TensorKit.adapt_transformer(
-        t::TensorKit.GenericTreeTransformer, data::ROCVector
-    )
-    new_data = map(t.data) do (U, structs_dst, structs_src)
-        return AMDGPU.Adapt.adapt(ROCArray, U), structs_dst, structs_src
-    end
-    return TensorKit.GenericTreeTransformer(new_data)
-end
+TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: ROCVector} = AMDGPU.Adapt.adapt(ROCArray, U) # ROCVector storage: adapt the recoupling matrix `U` to a ROCArray
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index aec9b2313..4fa8c0dea 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -156,11 +156,4 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
     end
 end
 
-function TensorKit.adapt_transformer(
-        t::TensorKit.GenericTreeTransformer, data::CuVector
-    )
-    new_data = map(t.data) do (U, structs_dst, structs_src)
-        return CUDA.Adapt.adapt(CuArray, U), structs_dst, structs_src
-    end
-    return TensorKit.GenericTreeTransformer(new_data)
-end
+TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: CuVector} = CUDA.CUDACore.Adapt.adapt(CuArray, U) # CuVector storage: adapt the recoupling matrix `U` to a CuArray. NOTE(review): the removed method used `CUDA.Adapt.adapt`; confirm `CUDA.CUDACore` exists for all supported CUDA.jl versions
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 84865d5e4..e66ddf195 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -624,7 +624,8 @@ function add_transform_kernel!(
                         ptriv, false, One(), Zero(), backend, allocator
                     )
                 end
-                mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
+                U′ = adapt_transformer(U, storagetype(tdst))
+                mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
                 @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
                     TO.tensoradd!(
                         tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
@@ -661,7 +662,6 @@ function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
         α, β, backend, allocator, scheduler
     )
-    transformer = adapt_transformer(transformer, data_dst)
     # Each entry covers one fusion block:
     #   U            — recoupling matrix (rows = dst trees, cols = src trees)
     #   sz_{dst,src} — array shape of each block (same for all trees in the block)
@@ -697,7 +697,8 @@ function add_transform_kernel!(
 
             # 2. Recoupling: buffer_dst = buffer_src * U^T  (each output tree is a linear
             #    combination of input trees weighted by the recoupling coefficients).
-            mul!(buffer_dst, buffer_src, transpose(StridedView(U)))
+            U′ = adapt_transformer(U, typeof(data_dst))
+            mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
 
             # 3. Insert: scatter column i of buffer_dst into the destination, applying the
             #    actual index permutation p in the same tensoradd! call.
@@ -713,3 +714,12 @@ function add_transform_kernel!(
     TO.allocator_reset!(allocator, cp)
     return nothing
 end
+
+"""
+    adapt_transformer(U::AbstractMatrix, ::Type{A})
+
+Return a version of the basis transformation `U` that is compatible with storage type `A`.
+Default is a no-op.
+Backends (e.g. CUDA, AMDGPU) should overload this for their vector types to ensure the recoupling matrix `U` is on the correct device.
+"""
+adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A} = U
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index 664f8a0d6..aaecbe746 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -203,12 +203,3 @@ end
 function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData)
     return length(mat) * prod(structs_dst[1])
 end
-
-"""
-    adapt_transformer(transformer::TreeTransformer, data::AbstractVector)
-
-Return a version of `transformer` whose internal arrays are compatible with `data`.
-Default is a no-op. Backends (e.g. CUDA, AMDGPU) should overload this for their vector types
-to ensure the recoupling matrix `U` inside `GenericTreeTransformer` is on the correct device.
-"""
-adapt_transformer(t::TreeTransformer, ::AbstractVector) = t

From fc323def3c721c43b2398976dcc81c2543cb53e1 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 28 Apr 2026 16:40:59 -0400
Subject: [PATCH 23/33] multithreading is hard -- race conditions are easy...

---
 src/tensors/indexmanipulations.jl | 118 +++++++++++++++++-------------
 src/tensors/treetransformers.jl   |   6 ++
 2 files changed, 75 insertions(+), 49 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index e66ddf195..f10d2bc71 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -584,59 +584,68 @@ function add_transform_kernel!(
         tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂)
             (f₁′, f₂′), coeff = transformer((f₁, f₂))
             @inbounds TO.tensoradd!(
-                tdst[f₁′, f₂′], tsrc[f₁, f₂],
-                p, false, α * coeff, β, backend, allocator
+                tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, backend, allocator
             )
         end
-    else
-        cp = TO.allocator_checkpoint!(allocator)
-        # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors
-        # form a *fusion block* and mix under the transformation via a recoupling matrix U
-        # (rows = destination trees, columns = source trees). We iterate over blocks.
-        tforeach(fusionblocks(tsrc); scheduler) do src
-            dst, U = transformer(src)
-            if length(src) == 1
-                # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul.
-                (f₁, f₂) = only(fusiontrees(src))
-                (f₁′, f₂′) = only(fusiontrees(dst))
-                @inbounds TO.tensoradd!(
-                    tdst[f₁′, f₂′], tsrc[f₁, f₂],
-                    p, false, α * only(U), β, backend, allocator
+        return nothing
+    end
+    cp = TO.allocator_checkpoint!(allocator)
+    # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors
+    # form a *fusion block* and mix under the transformation via a recoupling matrix U
+    # (rows = destination trees, columns = source trees). We iterate over blocks
+
+    # buffers have to be created without race condition: err on the side of caution
+    buffersz = 2 * length(space(tdst))
+    generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator
+        () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator)
+    end
+
+    OhMyThreads.@tasks for src in fusionblocks(tsrc)
+        # setup
+        OhMyThreads.@set scheduler = scheduler
+        OhMyThreads.@local buffer = generate_buffer()
+
+        dst, U = transformer(src)
+
+        if length(src) == 1
+            # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul.
+            (f₁, f₂) = only(fusiontrees(src))
+            (f₁′, f₂′) = only(fusiontrees(dst))
+            @inbounds TO.tensoradd!(
+                tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator
+            )
+        else
+            # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack.
+            #   1. Extract: flatten each source block into a column of buffer_src
+            #      (shape blocksize × cols), using a trivial permutation so that the
+            #      index layout is canonical before the matmul.
+            #   2. Recoupling: buffer_dst = buffer_src * U^T  (blocksize × rows)
+            #   3. Insert: scatter columns of buffer_dst to destination blocks,
+            #      applying the actual permutation p in the same step.
+            rows, cols = size(U)
+            sz_src = size(tsrc[first(fusiontrees(src))...])
+            blocksize = prod(sz_src)
+            ptriv = (ntuple(identity, length(sz_src)), ())
+            buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
+            buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
+            @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
+                TO.tensoradd!(
+                    sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂],
+                    ptriv, false, One(), Zero(), backend, allocator
                 )
-            else
-                # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack.
-                #   1. Extract: flatten each source block into a column of buffer_src
-                #      (shape blocksize × cols), using a trivial permutation so that the
-                #      index layout is canonical before the matmul.
-                #   2. Recoupling: buffer_dst = buffer_src * U^T  (blocksize × rows)
-                #   3. Insert: scatter columns of buffer_dst to destination blocks,
-                #      applying the actual permutation p in the same step.
-                rows, cols = size(U)
-                sz_src = size(tsrc[first(fusiontrees(src))...])
-                blocksize = prod(sz_src)
-                ptriv = (ntuple(identity, length(sz_src)), ())
-                buffer = TO.tensoralloc(storagetype(tdst), blocksize * (rows + cols), Val(true), allocator)
-                buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
-                buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
-                @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
-                    TO.tensoradd!(
-                        sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂],
-                        ptriv, false, One(), Zero(), backend, allocator
-                    )
-                end
-                U′ = adapt_transformer(U, storagetype(tdst))
-                mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
-                @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
-                    TO.tensoradd!(
-                        tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
-                        p, false, α, β, backend, allocator
-                    )
-                end
-                TO.tensorfree!(buffer, allocator)
             end
+            U′ = adapt_transformer(U, storagetype(tdst))
+            mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
+            @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
+                TO.tensoradd!(
+                    tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
+                    p, false, α, β, backend, allocator
+                )
+            end
+            TO.tensorfree!(buffer, allocator)
         end
-        TO.allocator_reset!(allocator, cp)
     end
+    TO.allocator_reset!(allocator, cp)
     return nothing
 end
 
@@ -667,7 +676,19 @@ function add_transform_kernel!(
     #   sz_{dst,src} — array shape of each block (same for all trees in the block)
     #   structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i
     cp = TO.allocator_checkpoint!(allocator)
-    tforeach(transformer.data; scheduler) do (U, (sz_dst, structs_dst), (sz_src, structs_src))
+
+    # buffers have to be created without race condition: err on the side of caution
+    buffersz = 2 * buffersize(transformer)
+    generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator
+        () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator)
+    end
+
+    OhMyThreads.@tasks for subtransformer in transformer.data
+        # setup
+        OhMyThreads.@set scheduler = scheduler
+        OhMyThreads.@local buffer = generate_buffer()
+        U, (sz_dst, structs_dst), (sz_src, structs_src) = subtransformer
+
         if length(U) == 1
             # Degenerate block with a single tree: no matmul needed.
             coeff = only(U)
@@ -682,7 +703,6 @@ function add_transform_kernel!(
             rows, cols = size(U)
             blocksize = prod(sz_src)
             ptriv = (ntuple(identity, length(sz_src)), ())
-            buffer = TO.tensoralloc(typeof(data_dst), blocksize * (rows + cols), Val(true), allocator)
             buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
             buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
 
diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl
index aaecbe746..f68378cc2 100644
--- a/src/tensors/treetransformers.jl
+++ b/src/tensors/treetransformers.jl
@@ -203,3 +203,9 @@ end
 function _transformer_weight((mat, structs_dst, structs_src)::GenericTransformerData)
     return length(mat) * prod(structs_dst[1])
 end
+
+function buffersize(transformer::GenericTreeTransformer)
+    return maximum(transformer.data; init = 0) do (basistransform, structures_dst, _)
+        return prod(structures_dst[1]) * size(basistransform, 1)
+    end
+end

From 3da7cb39199bf005cadfaf65f534b8ecfba8c27b Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 12 May 2026 08:45:03 -0400
Subject: [PATCH 24/33] merge fixes

---
 src/tensors/braidingtensor.jl | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index a3a131c3d..f567a3a2b 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -225,7 +225,7 @@ function planarcontract!(
 
     # Non-bosonic case: factor into a cyclic transpose (no crossings) + a single Artin braid
     # that swaps the two contracted legs, producing the R-symbol that A encodes. Naively
-    # using a single `add_braid!` is wrong: it would resolve cyclic moves as crossings and
+    # using a single `braid!` is wrong: it would resolve cyclic moves as crossings and
     # pick up spurious R-symbol factors.
     B_in_layout = (cindB == codB && oindB == domB)
     if B_in_layout
@@ -274,11 +274,11 @@ function planarcontract!(
 
     I = sectortype(C)
     BraidingStyle(I) isa Bosonic &&
-        return add_permute!(C, A, (oindA, reverse(cindA)), α, β, backend)
+        return permute!(C, A, (oindA, reverse(cindA)), α, β, backend, allocator)
 
     # Non-bosonic case: cyclic transpose A → (oindA, cindA) (no crossings), then a single
     # Artin braid swaps A′'s last two indices, producing the R-symbol that B encodes. Naively
-    # using a single `add_braid!` is wrong: it would resolve cyclic moves as crossings and
+    # using a single `braid!` is wrong: it would resolve cyclic moves as crossings and
     # pick up spurious R-symbol factors.
 
     A_in_layout = (oindA == codA && cindA == domA)
@@ -291,10 +291,21 @@ function planarcontract!(
         transpose!(A′, A, (oindA, cindA), One(), Zero(), backend, allocator)
     end
 
-    p = (oindA, reverse(cindA))
-    N = length(oindA)
-    levels = (ntuple(identity, N)..., (B.adjoint ? (N + 1, N + 2) : (N + 2, N + 1))...)
-    return braid!(C, A, p, levels, α, β, backend, allocator)
+    levelsB = B.adjoint ? (1, 2, 2, 1) : (2, 1, 1, 2)
+    N = numind(A)
+    M = N - 2
+    levels = (
+        ntuple(Returns(3), M)...,
+        levelsB[cindB[1]], levelsB[cindB[2]],
+    )
+
+    braid!(
+        C, A′, (ntuple(identity, M), (N, N - 1)),
+        levels, α, β, backend, allocator
+    )
+
+    A_in_layout || TO.tensorfree!(A′, allocator)
+    return C
 end
 
 # ambiguity fix:

From cd4e95054f1b11109ddb9f83ddbf94dc78ea2f56 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Sat, 16 May 2026 11:14:52 -0400
Subject: [PATCH 25/33] Apply suggestions from code review

Co-authored-by: Jutho <Jutho@users.noreply.github.com>
---
 docs/src/man/indexmanipulations.md | 44 ++++++++++++----------------
 src/tensors/indexmanipulations.jl  | 46 ++++++++++++++++++------------
 2 files changed, 47 insertions(+), 43 deletions(-)

diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md
index 8379aac3d..b644eff0c 100644
--- a/docs/src/man/indexmanipulations.md
+++ b/docs/src/man/indexmanipulations.md
@@ -10,7 +10,7 @@ using LinearAlgebra
 ```
 
 A `TensorMap{T, S, N₁, N₂}` is a linear map from a domain (a `ProductSpace{S, N₂}`) to a codomain (a `ProductSpace{S, N₁}`).
-In practice, the bipartition of the `N₁ + N₂` indices between domain and codomain is often not fixed: algorithms typically need to reshuffle indices between the two sides, reorder them, or change the arrow direction on individual indices before passing a tensor to a factorization or contraction.
+In practice, the bipartition of the `N₁ + N₂` indices between domain and codomain rarely remains fixed: algorithms typically need to reshuffle indices between the two sides, reorder them, or change the arrow direction on individual indices before passing a tensor to a factorization or contraction.
 
 Index manipulations cover all such operations.
 They act on the structure of the tensor data in a way that is fully determined by the categorical data of the `sectortype`, such that TensorKit automatically manipulates the tensor entries accordingly.
@@ -36,21 +36,25 @@ allind
 ## Reweighting
 
 Reweighting operations modify the entries of a tensor by applying local isomorphisms to individual indices, without changing the number of indices or their partition between domain and codomain.
-
-[`flip`](@ref) changes the arrow direction on selected indices by applying the corresponding isomorphism between a space and its dual.
-[`twist`](@ref) applies the topological spin (monoidal twist) to selected indices; for `BraidingStyle(I) == Bosonic()` this is always trivial.
+In particular, [`twist`](@ref) applies the topological spin (monoidal twist) to selected indices; this operation preserves the space of the indices and is completely trivial for `BraidingStyle(I) == Bosonic()`.
+In contrast, [`flip`](@ref) changes the arrow direction on selected indices by applying a (non-canonical!) isomorphism between the index space and its dual.
 
 ```@docs; canonical=false
-flip(t::AbstractTensorMap, I)
 twist(::AbstractTensorMap, ::Int)
 twist!
+flip(t::AbstractTensorMap, I)
 ```
 
 ## Inserting and removing unit spaces
 
-These functions add or remove a trivial tensor product factor at a specified index position, without affecting any other indices.
-[`insertleftunit`](@ref) inserts before position `i` and [`insertrightunit`](@ref) inserts after position `i`; [`removeunit`](@ref) undoes either insertion.
-Passing `Val(i)` instead of an `Int` for the position may improve type stability.
+The next set of functions add or remove a trivial tensor product factor at a specified index position, without affecting any other indices.
+We distinguish between [`insertleftunit`](@ref), which inserts a unit index before index `i` (the unit index becoming index `i`),
+and [`insertrightunit`](@ref), which inserts after index `i` (the unit index becoming index `i + 1`);
+[`removeunit`](@ref) undoes either insertion.
+
+For tensors `t` with `UnitStyle(sectortype(t)) == SimpleUnit()`, the only relevant difference between `insertleftunit(t, i + 1)` and `insertrightunit(t, i)` arises at the boundary between codomain and domain: `insertleftunit(t, numout(t) + 1)` inserts the unit index as the first index of the domain, whereas `insertrightunit(t, numout(t))` inserts it as the last index of the codomain.
+
+Passing `Val(i)` instead of an integer `i` for the position may improve type stability.
 
 ```@docs; canonical=false
 insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}
@@ -91,24 +95,14 @@ repartition!
 
 ## Fusing and splitting indices
 
-There is no dedicated function for fusing or splitting indices.
-For a plain tensor (`sectortype(t) == Trivial`), this is equivalent to `reshape` on the underlying array.
-
+There is no dedicated functionality for fusing or splitting indices.
 In the general case there is no canonical embedding of `V1 ⊗ V2` into the fused space `V = fuse(V1 ⊗ V2)`: any two such embeddings differ by a basis transform, i.e. there is a gauge freedom.
-TensorKit resolves this by requiring the user to construct an explicit isomorphism — the *fuser* — and contract it with the tensor:
-
-```julia
-f = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))
-@tensor t_fused[…, a, …] := f[a, i, j] * t[…, i, j, …]
-```
+TensorKit resolves this by requiring the user to construct an explicit isomorphism — the *fuser* — and contract it with the tensor.
+One particular isomorphism can be constructed using the [`unitary`](@ref) function.
+It preserves norms and inner products, and has an inverse given by its adjoint. 
+For a plain tensor (`sectortype(t) == Trivial`), applying this particular `unitary` is equivalent to `reshape` on the underlying array.
 
-Splitting is then the adjoint of the same map:
-
-```julia
-@tensor t_split[…, i, j, …] := f'[i, j, a] * t_fused[…, a, …]
-```
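+Fusing index `i` and `j = i+1` of a tensor `t` is then accomplished by contracting `t` with the fuser `f = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))`; splitting the fused index again is done by contracting with the adjoint `f'`.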
 
-Using `f'` as the splitter guarantees that the round-trip is the identity, i.e. `t_split == t`.
-Using a *different* isomorphism to split would give a physically equivalent but numerically different tensor, so it is important to keep `f` and its adjoint consistent throughout a calculation.
 
-Note that tensor factorizations (SVD, QR, etc.) can be applied directly to any index bipartition without needing to fuse indices first; see [Tensor factorizations](@ref ss_tensor_factorization).
+The resulting `unitary` is a dense `TensorMap`, and this fusion and splitting approach is not optimized for maximal performance. However, because many tensor operations, including tensor factorizations (SVD, QR, etc.), can be applied without needing any fusion, we do not expect fusion and splitting to be an essential part of the performance-critical parts of typical tensor algorithms.
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index f10d2bc71..3fef555e9 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -92,8 +92,7 @@ See [`twist!`](@ref) for storing the result in place.
 """
 function twist(t::AbstractTensorMap, inds; inv::Bool = false, copy::Bool = false)
     if has_shared_twist(t, inds)
-        copy || return t
-        return copy!(similar(t), t)
+        return copy ? Base.copy(t) : t
     end
     tdst = similar(t, promote_twist(t))
     copy!(tdst, t)
@@ -114,7 +113,12 @@ end
 Insert a trivial vector space, isomorphic to the underlying field, at position `i`,
 which can be specified as an `Int` or as `Val(i)` for improved type stability.
 More specifically, adds a left monoidal unit or its dual.
-
+Insert a trivial vector space, isomorphic to the underlying field, before position `i`,
+which should satisfy `1 ≤ i ≤ numind(t) + 1`
+and can be specified as an `Int` or as `Val(i)` for improved type stability.
+More specifically, add a left monoidal unit (or its dual) of the space associated with index `i`.
+The new index appears at position `i` in the new tensor,
+namely in its codomain for `1 ≤ i ≤ numout(t)` and in its domain otherwise.
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
 
 See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}),
@@ -143,8 +147,11 @@ end
         ) -> tdst
 
 Insert a trivial vector space, isomorphic to the underlying field, after position `i`,
-which can be specified as an `Int` or as `Val(i)` for improved type stability.
-More specifically, adds a right monoidal unit or its dual.
+which should satisfy `0 ≤ i ≤ numind(t)`
+and can be specified as an `Int` or as `Val(i)` for improved type stability.
+More specifically, add a right monoidal unit (or its dual) of the space associated with index `i`.
+The new index appears at position `i+1` in the new tensor,
+namely in its codomain for `0 ≤ i ≤ numout(t)` and in its domain otherwise.
 
 If `copy=false`, `tdst` might share data with `tsrc` whenever possible. Otherwise, a copy is always made.
 
@@ -221,7 +228,10 @@ See also [`permute`](@ref) for creating a new tensor.
 end
 
 """
-    permute(tsrc, (p₁, p₂)::Index2Tuple; copy = false, [backend], [allocator]) -> tdst
+    permute(
+        tsrc, (p₁, p₂)::Index2Tuple; copy = false,
+        backend = DefaultBackend(), allocator = DefaultAllocator()
+    ) -> tdst::TensorMap    
 
 Return tensor `tdst` obtained by permuting the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -307,9 +317,9 @@ end
 
 """
     braid(
-            tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy = false,
-            backend = DefaultBackend(), allocator = DefaultAllocator()
-        ) -> tdst::TensorMap
+        tsrc, (p₁, p₂)::Index2Tuple, levels::IndexTuple; copy = false,
+        backend = DefaultBackend(), allocator = DefaultAllocator()
+    ) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by braiding the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -376,9 +386,9 @@ end
 
 """
     transpose(
-            tsrc, (p₁, p₂)::Index2Tuple; copy = false,
-            backend = DefaultBackend(), allocator = DefaultAllocator()
-        ) -> tdst::TensorMap
+        tsrc, (p₁, p₂)::Index2Tuple; copy = false,
+        backend = DefaultBackend(), allocator = DefaultAllocator()
+    ) -> tdst::TensorMap
 
 Return tensor `tdst` obtained by transposing the indices of `tsrc`.
 The codomain and domain of `tdst` correspond to the indices in `p₁` and `p₂` of `tsrc` respectively.
@@ -433,9 +443,9 @@ See also [`repartition`](@ref) for creating a new tensor.
     check_spacetype(tdst, tsrc)
     numind(tsrc) == numind(tdst) ||
         throw(ArgumentError("tsrc and tdst should have an equal amount of indices"))
-    all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...)
-    p₁ = ntuple(i -> all_inds[i], numout(tdst))
-    p₂ = reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst)))
+    p₁, p₂ = let all_inds = (codomainind(tsrc)..., reverse(domainind(tsrc))...)
+        ntuple(i -> all_inds[i], numout(tdst)), reverse(ntuple(i -> all_inds[i + numout(tdst)], numin(tdst)))
+    end
     return transpose!(tdst, tsrc, (p₁, p₂), α, β, backend, allocator)
 end
 
@@ -460,9 +470,9 @@ See also [`repartition!`](@ref) for writing into an existing destination.
     )
     N₁ + N₂ == numind(t) ||
         throw(ArgumentError("Invalid repartition: $(numind(t)) to ($N₁, $N₂)"))
-    all_inds = (codomainind(t)..., reverse(domainind(t))...)
-    p₁ = ntuple(i -> all_inds[i], N₁)
-    p₂ = reverse(ntuple(i -> all_inds[i + N₁], N₂))
+    p₁, p₂ = let all_inds = (codomainind(t)..., reverse(domainind(t))...)
+        ntuple(i -> all_inds[i], N₁), reverse(ntuple(i -> all_inds[i + N₁], N₂))
+    end
     return transpose(t, (p₁, p₂); copy, backend, allocator)
 end
 

From d60c22dafdaadc1d4641cb8341de6aa30079900a Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 08:34:11 -0400
Subject: [PATCH 26/33] fix variable name

[skip ci]
---
 src/tensors/indexmanipulations.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 3fef555e9..ebabbf9b8 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -607,7 +607,7 @@ function add_transform_kernel!(
     # buffers have to be created without race condition: err on the side of caution
     buffersz = 2 * length(space(tdst))
     generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator
-        () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator)
+        () -> @lock lock TO.tensoralloc(storagetype(tdst), buffersz, Val(true), allocator)
     end
 
     OhMyThreads.@tasks for src in fusionblocks(tsrc)

From ac8f8beb997fe4b08a1f154b14791f65d63942a2 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 08:48:56 -0400
Subject: [PATCH 27/33] remove double error check

---
 src/tensors/indexmanipulations.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index ebabbf9b8..1d60ae32e 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -68,9 +68,6 @@ function twist!(t::AbstractTensorMap, inds; inv::Bool = false)
         throw(ArgumentError("Can't in-place twist a real tensor with complex sector type"))
     has_shared_twist(t, inds) && return t
 
-    (scalartype(t) <: Real && !(sectorscalartype(sectortype(t)) <: Real)) &&
-        throw(ArgumentError("No in-place `twist!` for a real tensor with complex sector type"))
-
     N₁ = numout(t)
     for (f₁, f₂) in fusiontrees(t)
         θ = prod(i -> i <= N₁ ? twist(f₁.uncoupled[i]) : twist(f₂.uncoupled[i - N₁]), inds)

From 4a7f503730271a680822562765e376a94377139b Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 09:05:02 -0400
Subject: [PATCH 28/33] immediately adapt treetransformers

---
 Project.toml                             |  3 +--
 ext/TensorKitAMDGPUExt/roctensormap.jl   |  2 --
 ext/TensorKitAdaptExt.jl                 | 26 ------------------------
 ext/TensorKitCUDAExt/TensorKitCUDAExt.jl |  2 ++
 ext/TensorKitCUDAExt/cutensormap.jl      |  2 --
 ext/TensorKitCUDAExt/truncation.jl       |  6 +++---
 src/TensorKit.jl                         |  2 ++
 src/tensors/adjoint.jl                   |  2 ++
 src/tensors/braidingtensor.jl            |  8 ++++++++
 src/tensors/diagonal.jl                  |  5 +++++
 src/tensors/indexmanipulations.jl        | 17 ++++------------
 src/tensors/tensor.jl                    |  5 +++++
 12 files changed, 32 insertions(+), 48 deletions(-)
 delete mode 100644 ext/TensorKitAdaptExt.jl

diff --git a/Project.toml b/Project.toml
index 75db02f08..668ec1cf2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ version = "0.17.0"
 authors = ["Jutho Haegeman, Lukas Devos"]
 
 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -19,7 +20,6 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 
 [weakdeps]
-Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -28,7 +28,6 @@ Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [extensions]
-TensorKitAdaptExt = "Adapt"
 TensorKitAMDGPUExt = "AMDGPU"
 TensorKitCUDAExt = ["CUDA", "cuTENSOR"]
 TensorKitChainRulesCoreExt = "ChainRulesCore"
diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
index 56dea938e..f2f094c60 100644
--- a/ext/TensorKitAMDGPUExt/roctensormap.jl
+++ b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -162,5 +162,3 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
-
-TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: ROCVector} = AMDGPU.Adapt.adapt(ROCArray, U)
diff --git a/ext/TensorKitAdaptExt.jl b/ext/TensorKitAdaptExt.jl
deleted file mode 100644
index 4d2693b7b..000000000
--- a/ext/TensorKitAdaptExt.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-module TensorKitAdaptExt
-
-using TensorKit
-using TensorKit: AdjointTensorMap
-using Adapt
-
-function Adapt.adapt_structure(to, x::TensorMap)
-    data′ = adapt(to, x.data)
-    return TensorMap{eltype(data′)}(data′, space(x))
-end
-function Adapt.adapt_structure(to, x::AdjointTensorMap)
-    return adjoint(adapt(to, parent(x)))
-end
-function Adapt.adapt_structure(to, x::DiagonalTensorMap)
-    data′ = adapt(to, x.data)
-    return DiagonalTensorMap(data′, x.domain)
-end
-function Adapt.adapt_structure(::Type{T}, x::BraidingTensor{T′, S, A}) where {T <: Number, T′, S, A}
-    A′ = TensorKit.similarstoragetype(A, T)
-    return BraidingTensor{T, S, A′}(space(x), x.adjoint)
-end
-function Adapt.adapt_structure(::Type{TA}, x::BraidingTensor{T, S, A}) where {T′, TA <: DenseArray{T′}, T, S, A}
-    return BraidingTensor{T′, S, TA}(space(x), x.adjoint)
-end
-
-end
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
index 1a5c28f7c..aa7320f32 100644
--- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -7,6 +7,8 @@ import CUDA.cuRAND: rand as curand, rand! as curand!, randn as curandn, randn! a
 using Strided: StridedViews
 using CUDA.CUDACore.KernelAbstractions: @kernel, @index, get_backend
 
+using Adapt: Adapt
+
 using TensorKit
 using TensorKit.Factorizations
 using TensorKit.Strided
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index 4fa8c0dea..016749fce 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -155,5 +155,3 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
-
-TensorKit.adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A <: CuVector} = CUDA.CUDACore.Adapt.adapt(CuArray, U)
diff --git a/ext/TensorKitCUDAExt/truncation.jl b/ext/TensorKitCUDAExt/truncation.jl
index 4b2111c50..a87e6c97a 100644
--- a/ext/TensorKitCUDAExt/truncation.jl
+++ b/ext/TensorKitCUDAExt/truncation.jl
@@ -51,19 +51,19 @@ end
 function MatrixAlgebraKit.findtruncated_svd(values::CuSectorVector, strategy::S) where {S <: MatrixAlgebraKit.TruncationStrategy}
     # returning a CuSectorVector wrecks things in truncate_{co}domain
     # because of scalar indexing
-    return CUDA.CUDACore.Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated(values, strategy))
+    return Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated(values, strategy))
 end
 
 for strat in (:(MatrixAlgebraKit.TruncationByOrder), :(MatrixAlgebraKit.TruncationByError), :(MatrixAlgebraKit.TruncationIntersection), :(TensorKit.Factorizations.TruncationSpace))
     @eval function MatrixAlgebraKit.findtruncated_svd(values::CuSectorVector, strategy::$strat)
         # returning a CuSectorVector wrecks things in truncate_{co}domain
         # because of scalar indexing
-        return CUDA.CUDACore.Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated(values, strategy))
+        return Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated(values, strategy))
     end
 end
 
 function MatrixAlgebraKit.findtruncated_svd(values::CuSectorVector, strategy::MatrixAlgebraKit.TruncationByValue)
     atol = TensorKit.Factorizations.rtol_to_atol(values, strategy.p, strategy.atol, strategy.rtol)
     strategy′ = trunctol(; atol, strategy.by, strategy.keep_below)
-    return SectorDict(c => CUDA.CUDACore.Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated_svd(d, strategy′)) for (c, d) in pairs(values))
+    return SectorDict(c => Adapt.adapt(Vector, MatrixAlgebraKit.findtruncated_svd(d, strategy′)) for (c, d) in pairs(values))
 end
diff --git a/src/TensorKit.jl b/src/TensorKit.jl
index db3d9c50f..6a2828588 100644
--- a/src/TensorKit.jl
+++ b/src/TensorKit.jl
@@ -149,6 +149,8 @@ import Base.Meta
 
 using Random: Random, rand!, randn!
 
+using Adapt: Adapt
+
 # Auxiliary files
 #-----------------
 include("auxiliary/auxiliary.jl")
diff --git a/src/tensors/adjoint.jl b/src/tensors/adjoint.jl
index ca484e77b..820e87375 100644
--- a/src/tensors/adjoint.jl
+++ b/src/tensors/adjoint.jl
@@ -50,6 +50,8 @@ Base.@propagate_inbounds function subblock(t::AdjointTensorMap, (f₁, f₂)::Tu
     return permutedims(conj(data), (domainind(tp)..., codomainind(tp)...))
 end
 
+Adapt.adapt_structure(to, x::AdjointTensorMap) = adjoint(Adapt.adapt(to, parent(x)))
+
 # Show
 #------
 function Base.showarg(io::IO, t::AdjointTensorMap, toplevel::Bool)
diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index f567a3a2b..a1b7dbd02 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -53,6 +53,14 @@ function BraidingTensor{T}(V::HomSpace, adjoint::Bool = false) where {T}
     return BraidingTensor{T}(V[2], V[1], adjoint)
 end
 
+function Adapt.adapt_structure(::Type{T}, x::BraidingTensor{T′, S, A}) where {T <: Number, T′, S, A}
+    A′ = TensorKit.similarstoragetype(A, T)
+    return BraidingTensor{T, S, A′}(space(x), x.adjoint)
+end
+function Adapt.adapt_structure(::Type{TA}, x::BraidingTensor{T, S, A}) where {T′, TA <: DenseArray{T′}, T, S, A}
+    return BraidingTensor{T′, S, TA}(space(x), x.adjoint)
+end
+
 function Base.adjoint(b::BraidingTensor{T, S, A}) where {T, S, A}
     return BraidingTensor{T, S, A}(b.V1, b.V2, !b.adjoint)
 end
diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl
index b2ac4134b..5fc6682e3 100644
--- a/src/tensors/diagonal.jl
+++ b/src/tensors/diagonal.jl
@@ -133,6 +133,11 @@ function Base.convert(::Type{DiagonalTensorMap}, d::Dict{Symbol, Any})
     return convert(DiagonalTensorMap, convert(TensorMap, d))
 end
 
+function Adapt.adapt_structure(to, x::DiagonalTensorMap)
+    data′ = Adapt.adapt(to, x.data)
+    return DiagonalTensorMap(data′, x.domain)
+end
+
 # Complex, real and imaginary parts
 #-----------------------------------
 for f in (:real, :imag, :complex)
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 1d60ae32e..a1d537371 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -641,8 +641,8 @@ function add_transform_kernel!(
                     ptriv, false, One(), Zero(), backend, allocator
                 )
             end
-            U′ = adapt_transformer(U, storagetype(tdst))
-            mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
+            U′ = Adapt.adapt(StridedView(U), storagetype(tdst))
+            mul!(buffer_dst, buffer_src, transpose(U′))
             @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
                 TO.tensoradd!(
                     tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
@@ -724,8 +724,8 @@ function add_transform_kernel!(
 
             # 2. Recoupling: buffer_dst = buffer_src * U^T  (each output tree is a linear
             #    combination of input trees weighted by the recoupling coefficients).
-            U′ = adapt_transformer(U, typeof(data_dst))
-            mul!(buffer_dst, buffer_src, transpose(StridedView(U′)))
+            U′ = Adapt.adapt(StridedView(U), typeof(data_dst))
+            mul!(buffer_dst, buffer_src, transpose(U′))
 
             # 3. Insert: scatter column i of buffer_dst into the destination, applying the
             #    actual index permutation p in the same tensoradd! call.
@@ -741,12 +741,3 @@ function add_transform_kernel!(
     TO.allocator_reset!(allocator, cp)
     return nothing
 end
-
-"""
-    adapt_transformer(U::AbstractMatrix, ::Type{A})
-
-Return a version of the basis transformation `U` that is compatible for storage type `A`.
-Default is a no-op.
-Backends (e.g. CUDA, AMDGPU) should overload this for their vector types to ensure the recoupling matrix `U` is on the correct device.
-"""
-adapt_transformer(U::AbstractMatrix, ::Type{A}) where {A} = U
diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl
index bd6609163..64ae90a69 100644
--- a/src/tensors/tensor.jl
+++ b/src/tensors/tensor.jl
@@ -554,3 +554,8 @@ function Base.promote_rule(
     A = promote_storagetype(VectorInterface.promote_add(scalartype(TT₁), scalartype(TT₂)), TT₁, TT₂)
     return tensormaptype(S, N₁, N₂, A)
 end
+
+function Adapt.adapt_structure(to, x::TensorMap)
+    data = Adapt.adapt(to, x.data)
+    return TensorMap{eltype(data)}(data, space(x))
+end

From cdd6d163667c92ca9f493957c058bc2788a07185 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 09:13:44 -0400
Subject: [PATCH 29/33] clean up

---
 src/tensors/indexmanipulations.jl | 58 ++++++++++---------------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index a1d537371..61bb5536f 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -546,14 +546,8 @@ Base.@deprecate(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer,
         α::Number, β::Number, backend, allocator
     )
-    # `permute` is used as a stand-in for all index rearrangements here: permute, braid, and
-    # transpose all produce the same destination space for a given permutation tuple `p`.
     @boundscheck spacecheck_transform(permute, tdst, tsrc, p)
 
-    # Three cases, from cheapest to most expensive:
-    #   1. trivial permutation: delegate to `add!` which handles α/β scaling directly
-    #   2. Trivial sector type: no fusion tree bookkeeping, call tensoradd! on the raw array
-    #   3. general case: iterate over (blocks of) fusion trees, potentially multi-threaded
     if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc)
         add!(tdst, tsrc, α, β)
     else
@@ -586,8 +580,6 @@ function add_transform_kernel!(
     )
     I = sectortype(tdst)
     if FusionStyle(I) === UniqueFusion()
-        # Abelian / unique-fusion: each source fusion tree pair (f₁, f₂) maps to exactly
-        # one destination pair (f₁′, f₂′) with a scalar coefficient. No mixing occurs.
         tforeach(fusiontrees(tsrc); scheduler) do (f₁, f₂)
             (f₁′, f₂′), coeff = transformer((f₁, f₂))
             @inbounds TO.tensoradd!(
@@ -597,9 +589,6 @@ function add_transform_kernel!(
         return nothing
     end
     cp = TO.allocator_checkpoint!(allocator)
-    # Non-Abelian fusion: trees sharing the same set of uncoupled (external) sectors
-    # form a *fusion block* and mix under the transformation via a recoupling matrix U
-    # (rows = destination trees, columns = source trees). We iterate over blocks
 
     # buffers have to be created without race condition: err on the side of caution
     buffersz = 2 * length(space(tdst))
@@ -610,39 +599,39 @@ function add_transform_kernel!(
     OhMyThreads.@tasks for src in fusionblocks(tsrc)
         # setup
         OhMyThreads.@set scheduler = scheduler
-        OhMyThreads.@local buffer = generate_buffer()
-
         dst, U = transformer(src)
 
-        if length(src) == 1
-            # Degenerate block: single tree, U is a 1×1 scalar — skip the buffer + matmul.
+        if length(src) == 1 # Degenerate block with a single tree: no matmul needed.
             (f₁, f₂) = only(fusiontrees(src))
             (f₁′, f₂′) = only(fusiontrees(dst))
             @inbounds TO.tensoradd!(
                 tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator
             )
-        else
-            # Multi-tree block: apply recoupling via a three-step pack → matmul → unpack.
-            #   1. Extract: flatten each source block into a column of buffer_src
-            #      (shape blocksize × cols), using a trivial permutation so that the
-            #      index layout is canonical before the matmul.
-            #   2. Recoupling: buffer_dst = buffer_src * U^T  (blocksize × rows)
-            #   3. Insert: scatter columns of buffer_dst to destination blocks,
-            #      applying the actual permutation p in the same step.
+        else # Multi-tree block: pack → recoupling matmul → unpack.
+            OhMyThreads.@local buffer = generate_buffer()
             rows, cols = size(U)
             sz_src = size(tsrc[first(fusiontrees(src))...])
             blocksize = prod(sz_src)
             ptriv = (ntuple(identity, length(sz_src)), ())
             buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
             buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
+
+            # 1. Extract: copy each source block into column i of buffer_src as a flat vector,
+            #    using a trivial permutation so the layout is canonical before the matmul.
             @inbounds for (i, (f₁, f₂)) in enumerate(fusiontrees(src))
                 TO.tensoradd!(
                     sreshape(buffer_src[:, i], sz_src), tsrc[f₁, f₂],
                     ptriv, false, One(), Zero(), backend, allocator
                 )
             end
-            U′ = Adapt.adapt(StridedView(U), storagetype(tdst))
+
+            # 2. Recoupling: buffer_dst = buffer_src * U^T  (each output tree is a linear
+            #    combination of input trees weighted by the recoupling coefficients).
+            U′ = Adapt.adapt(storagetype(tdst), StridedView(U))
             mul!(buffer_dst, buffer_src, transpose(U′))
+
+            # 3. Insert: scatter column i of buffer_dst into the destination, applying the
+            #    actual index permutation p in the same tensoradd! call.
             @inbounds for (i, (f₃, f₄)) in enumerate(fusiontrees(dst))
                 TO.tensoradd!(
                     tdst[f₃, f₄], sreshape(buffer_dst[:, i], sz_src),
@@ -657,15 +646,11 @@ function add_transform_kernel!(
 end
 
 # TensorMap specializations: operate directly on the flat data vector to avoid
-# repeated dictionary lookups into t.data. The transformer has precomputed all
-# StridedView descriptors (size, offset, strides) for each fusion tree block.
-# No symmetry types left -- no repeated specialization needed
+# repeated specialization -- this only depends on `numind` and `eltype`.
 function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p, transformer::AbelianTreeTransformer,
         α, β, backend, allocator, scheduler
     )
-    # Each entry is (coeff, struct_dst, struct_src) where struct_{dst,src} = (size, offset, strides)
-    # locating the block for one fusion tree pair inside the flat data vector.
     tforeach(transformer.data; scheduler) do (coeff, struct_dst, struct_src)
         TO.tensoradd!(
             StridedView(data_dst, struct_dst...), StridedView(data_src, struct_src...),
@@ -678,10 +663,6 @@ function add_transform_kernel!(
         data_dst::DenseVector, data_src::DenseVector, p, transformer::GenericTreeTransformer,
         α, β, backend, allocator, scheduler
     )
-    # Each entry covers one fusion block:
-    #   U            — recoupling matrix (rows = dst trees, cols = src trees)
-    #   sz_{dst,src} — array shape of each block (same for all trees in the block)
-    #   structs_{dst,src}[i] — (offset, strides) into the flat data vector for tree i
     cp = TO.allocator_checkpoint!(allocator)
 
     # buffers have to be created without race condition: err on the side of caution
@@ -693,20 +674,17 @@ function add_transform_kernel!(
     OhMyThreads.@tasks for subtransformer in transformer.data
         # setup
         OhMyThreads.@set scheduler = scheduler
-        OhMyThreads.@local buffer = generate_buffer()
         U, (sz_dst, structs_dst), (sz_src, structs_src) = subtransformer
 
-        if length(U) == 1
-            # Degenerate block with a single tree: no matmul needed.
+        if length(U) == 1 # Degenerate block with a single tree: no matmul needed.
             coeff = only(U)
             TO.tensoradd!(
                 StridedView(data_dst, sz_dst, only(structs_dst)...),
                 StridedView(data_src, sz_src, only(structs_src)...),
                 p, false, α * coeff, β, backend, allocator
             )
-        else
-            # Multi-tree block: pack → recoupling matmul → unpack.
-            # buffer2 = source staging area, buffer1 = destination staging area.
+        else # Multi-tree block: pack → recoupling matmul → unpack.
+            OhMyThreads.@local buffer = generate_buffer()
             rows, cols = size(U)
             blocksize = prod(sz_src)
             ptriv = (ntuple(identity, length(sz_src)), ())
@@ -724,7 +702,7 @@ function add_transform_kernel!(
 
             # 2. Recoupling: buffer_dst = buffer_src * U^T  (each output tree is a linear
             #    combination of input trees weighted by the recoupling coefficients).
-            U′ = Adapt.adapt(StridedView(U), typeof(data_dst))
+            U′ = Adapt.adapt(typeof(data_dst), StridedView(U))
             mul!(buffer_dst, buffer_src, transpose(U′))
 
             # 3. Insert: scatter column i of buffer_dst into the destination, applying the

From f127e94c560f886d49ba633015326fb3e35f2dab Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 09:27:39 -0400
Subject: [PATCH 30/33] more clean up with buffers

---
 src/tensors/indexmanipulations.jl | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 61bb5536f..655f0ca7f 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -542,6 +542,8 @@ Base.@deprecate(
     transpose!(tdst, tsrc, p, α, β, backend...)
 )
 
+# Kernel implementation
+# ---------------------
 @propagate_inbounds function add_transform!(
         tdst::AbstractTensorMap, tsrc::AbstractTensorMap, p::Index2Tuple, transformer,
         α::Number, β::Number, backend, allocator
@@ -590,11 +592,8 @@ function add_transform_kernel!(
     end
     cp = TO.allocator_checkpoint!(allocator)
 
-    # buffers have to be created without race condition: err on the side of caution
-    buffersz = 2 * length(space(tdst))
-    generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator
-        () -> @lock lock TO.tensoralloc(storagetype(tdst), buffersz, Val(true), allocator)
-    end
+    # buffers have to be created without race condition: err on the side of caution with a lock
+    buffer_lock = Threads.ReentrantLock()
 
     OhMyThreads.@tasks for src in fusionblocks(tsrc)
         # setup
@@ -608,10 +607,10 @@ function add_transform_kernel!(
                 tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * only(U), β, backend, allocator
             )
         else # Multi-tree block: pack → recoupling matmul → unpack.
-            OhMyThreads.@local buffer = generate_buffer()
             rows, cols = size(U)
             sz_src = size(tsrc[first(fusiontrees(src))...])
             blocksize = prod(sz_src)
+            buffer = @lock buffer_lock TO.tensoralloc(storagetype(tdst), blocksize * (rows + cols), Val(true), allocator)
             ptriv = (ntuple(identity, length(sz_src)), ())
             buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
             buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
@@ -638,7 +637,7 @@ function add_transform_kernel!(
                     p, false, α, β, backend, allocator
                 )
             end
-            TO.tensorfree!(buffer, allocator)
+            @lock buffer_lock TO.tensorfree!(buffer, allocator)
         end
     end
     TO.allocator_reset!(allocator, cp)
@@ -665,11 +664,8 @@ function add_transform_kernel!(
     )
     cp = TO.allocator_checkpoint!(allocator)
 
-    # buffers have to be created without race condition: err on the side of caution
-    buffersz = 2 * buffersize(transformer)
-    generate_buffer = let lock = Threads.ReentrantLock(), allocator = allocator
-        () -> @lock lock TO.tensoralloc(typeof(data_dst), buffersz, Val(true), allocator)
-    end
+    # buffers have to be created without race condition: err on the side of caution with a lock
+    buffer_lock = Threads.ReentrantLock()
 
     OhMyThreads.@tasks for subtransformer in transformer.data
         # setup
@@ -684,9 +680,9 @@ function add_transform_kernel!(
                 p, false, α * coeff, β, backend, allocator
             )
         else # Multi-tree block: pack → recoupling matmul → unpack.
-            OhMyThreads.@local buffer = generate_buffer()
             rows, cols = size(U)
             blocksize = prod(sz_src)
+            buffer = @lock buffer_lock TO.tensoralloc(typeof(data_dst), blocksize * (rows + cols), Val(true), allocator)
             ptriv = (ntuple(identity, length(sz_src)), ())
             buffer_dst = StridedView(buffer, (blocksize, rows), (1, blocksize), 0)
             buffer_src = StridedView(buffer, (blocksize, cols), (1, blocksize), blocksize * rows)
@@ -713,7 +709,7 @@ function add_transform_kernel!(
                     p, false, α, β, backend, allocator
                 )
             end
-            TO.tensorfree!(buffer, allocator)
+            @lock buffer_lock TO.tensorfree!(buffer, allocator)
         end
     end
     TO.allocator_reset!(allocator, cp)

From e19b72abd2acd45a4348d75944a75a8150c4c118 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 15:44:01 -0400
Subject: [PATCH 31/33] add docs about fermionic contraction

---
 docs/src/man/contractions.md                  | 154 ++++++++-
 .../man/img/tensor-fermioniccontraction.svg   | 317 ++++++++++++++++++
 2 files changed, 470 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/man/img/tensor-fermioniccontraction.svg

diff --git a/docs/src/man/contractions.md b/docs/src/man/contractions.md
index d70b5681d..10c868ae3 100644
--- a/docs/src/man/contractions.md
+++ b/docs/src/man/contractions.md
@@ -99,8 +99,160 @@ For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have
 
 ## Fermionic tensor contractions
 
-TODO
+Whenever `BraidingStyle(I) == Fermionic()`, some complications come up.
+The most important distinction from the `Bosonic()` case is that twists are no longer trivial, such that we must be careful about how we can manipulate network diagrams.
+
+To illustrate these complications, we take a look at a concrete example first, and study the following tensor network:
+
+```@raw html
+<img src="../img/tensor-fermioniccontraction.svg" alt="fermionic contraction example" class="color-invertible"/>
+```
+
+```@example fermioncontraction
+using TensorKit # hide
+V₁ = Vect[FermionParity](0 => 1, 1 => 1)
+V₂ = Vect[FermionParity](0 => 2, 1 => 2)
+A = rand(V₁ ← V₁ ⊗ V₂)
+X = rand(V₁ ← V₁)
+B = rand(V₁ ⊗ V₂ ← V₁)
+```
+
+We can expand this into binary contractions, by first contracting `X` with `A`, and then contracting the result with `B`:
+
+```@example fermioncontraction
+AX = repartition(A, 2, 1) * X
+AXB = repartition(AX, 1, 2) * B
+```
+
+Alternatively, we could decide that we first wish to contract `A` with `B`, and only then contract the result with `X`:
+
+```@example fermioncontraction
+AB = permute(A, ((1, 2), (3,))) * permute(B, ((2,), (1, 3)))
+ABX = repartition(permute(AB, ((1, 4), (2, 3))) * repartition(X, 2, 0), 1, 1)
+```
+
+This is where the issue becomes clearer, as the results are no longer equal:
+
+```@example fermioncontraction
+AXB ≈ ABX
+```
+
+### Trivializing the twist
+
+So what happened?
+If we carefully inspect what we actually computed here, we can show that in order to deform one diagram into the other, we have to introduce a self-crossing, which then altered the result.
+While the example here is still simple to follow, in general we would like the result of `@tensor` expressions to be independent of the input order of the tensors.
+This is especially true for larger expressions where we wish to dynamically compute the optimal contraction order, as this would alter the order in a very non-transparent manner.
+
+The way out of this effectively consists of absorbing this twist in the coevaluation map ``η``.
+This modified map ``\tilde{η} := η ∘ θ``, where ``θ`` represents the twist, ensures that the result no longer depends on the order of evaluation.
+In particular, one can show that any time two tensors would swap places, we would simultaneously exchange one evaluation map ``ϵ`` for a coevaluation ``\tilde{η}``, while also incurring a twist ``θ`` such that both cancel out.
+To make this concrete, we show how our previous example now leads to a unique result:
+
+```@example fermioncontraction
+function fermion_mul(A, B)
+    return A * twist(B, findall(isdual, codomain(B).spaces))
+end
+
+# order I:
+AX = fermion_mul(repartition(A, 2, 1),  X)
+AXB = fermion_mul(repartition(AX, 1, 2) , B)
+
+# order II:
+AB = fermion_mul(permute(A, ((1, 2), (3,))), permute(B, ((2,), (1, 3))))
+ABX = repartition(fermion_mul(permute(AB, ((1, 4), (2, 3))), repartition(X, 2, 0)), 1, 1)
+
+AXB ≈ ABX
+```
+
+This is the so-called **supertrace** formalism, and is effectively what `@tensor` ends up implementing for fermionic contractions.
+For more details about this formalism, we refer to [^Mortier].
+
+```@example fermioncontraction
+# @tensor
+@tensor result[-1; -2] := A[-1; 1 2] * X[1; 3] * B[3 2; -2]
+
+AXB ≈ result
+```
+
+### (Non)-unitarity
+
+While this modified ``\tilde{η}`` solves the issues related to contractions, it does come at a cost.
+The main issue is that this map does not constitute a positive definite map, and in particular is at odds with a positive inner product.
+Such a positive inner product is however required to properly define (orthogonal) factorizations, non-negative norms, etc.
+
+Therefore, we reserve the supertrace formalism exclusively for tensor contractions.
+For matrix-like operations such as factorizations, matrix functions, norms, etc, we retain the positive definite inner product.
+It is also always possible to manually emulate one or the other, by inserting appropriate calls to `twist`.
+In what follows, we simply showcase some noteworthy differences between the two formalisms, as these can be a common source of errors.
+Throughout, we use the following simple fermionic tensor as a running example:
+
+```@example fermionnorm
+using TensorKit # hide
+V = Vect[FermionParity](0 => 1, 1 => 1)
+t = ones(V' ← V')
+```
+
+- Computing a norm via a contraction:
+  the squared norm of `t`, computed via the supertrace contraction, no longer agrees with `norm(t)^2`.
+  In particular, the `@tensor` self-contraction can even vanish for a manifestly non-zero tensor:
+
+```@example fermionnorm
+norm(t)^2, @tensor conj(t[a; b]) * t[a; b]
+```
+
+Inserting a `twist` on the contracted codomain index cancels the twist that `@tensor` automatically introduces, and recovers the trace-formalism result:
+
+```@example fermionnorm
+norm(t)^2 ≈ @tensor conj(t[a; b]) * twist(t, 1)[a; b]
+```
+
+- Using unitarity to simplify `U' * U ≈ I`:
+  the factor `U` returned by `svd_compact` is left-isometric in the *trace* sense, i.e. `U' * U ≈ id(domain(U))` as a matrix product, but this identity no longer holds when the same product is written as a tensor contraction:
+
+```@example fermionnorm
+U, S, Vᴴ = svd_compact(t)
+@tensor UdU[i; j] := conj(U[k; i]) * U[k; j]
+U' * U ≈ id(domain(U)), UdU ≈ id(domain(U))
+```
+
+The matrix-product version satisfies orthogonality, but the `@tensor` version differs by the fermionic twist on the contracted index.
+This is a common pitfall whenever an isometry obtained from a factorization is fed straight into an `@tensor` expression.
+
+- Computing a matrix function through a manual Taylor expansion:
+  matrix functions such as `exp`, `log`, `sqrt` are defined through the matrix product (trace formalism) and therefore have no immediate counterpart in terms of `@tensor` expressions.
+  In particular, replacing each matrix power by an `@tensor` self-contraction yields a different result, even at low order:
+
+```@example fermionnorm
+function exp_via_tensor(t, order)
+    out = id(domain(t))
+    tn = id(domain(t))
+    for n in 1:order
+        @tensor next[a; b] := tn[a; c] * t[c; b]
+        tn = next
+        out += tn / factorial(n)
+    end
+    return out
+end
+exp(t) ≈ exp_via_tensor(t, 10)
+```
+
+The same Taylor expansion written with the matrix product instead does reproduce `exp(t)`, confirming that the discrepancy is in the contraction step rather than the truncation order:
+
+```@example fermionnorm
+exp_via_mul(t, order) = sum(t^n / factorial(n) for n in 0:order)
+exp(t) ≈ exp_via_mul(t, 10)
+```
+
+!!! note
+    Both the supertrace and the trace formalism constitute valid, consistent frameworks, each with their own advantages and disadvantages.
+    For practical applications, it can be convenient to select one or the other, and to take special care when trying to use properties of one framework in the other.
+    In general, each case must be carefully evaluated to check which framework is correct, but a good rule of thumb is to be careful when using properties of orthogonality in combination with `@tensor` expressions.
+
 
 ## Anyonic tensor contractions
 
 TODO
+
+
+[^Mortier]: Mortier, Q., Devos, L., Burgelman, L., et al. (2025). Fermionic Tensor Network Methods. SciPost Physics 18, no. 1. [10.21468/SciPostPhys.18.1.012](https://doi.org/10.21468/SciPostPhys.18.1.012).
diff --git a/docs/src/man/img/tensor-fermioniccontraction.svg b/docs/src/man/img/tensor-fermioniccontraction.svg
new file mode 100644
index 000000000..c0ec84cf9
--- /dev/null
+++ b/docs/src/man/img/tensor-fermioniccontraction.svg
@@ -0,0 +1,317 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg
+   width="66.532455"
+   height="206.77463"
+   style="fill:none"
+   version="1.1"
+   id="svg212"
+   sodipodi:docname="tensor-fermioniccontraction.svg"
+   inkscape:version="1.4.2 (ebf0e940, 2025-05-08)"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <defs
+     id="defs212">
+    <rect
+       x="116.36719"
+       y="157.54327"
+       width="15.913462"
+       height="17.106972"
+       id="rect215" />
+    <rect
+       x="135.46333"
+       y="227.16466"
+       width="14.322115"
+       height="15.117788"
+       id="rect214" />
+    <rect
+       x="127.90445"
+       y="88.717545"
+       width="27.052885"
+       height="15.117788"
+       id="rect213" />
+  </defs>
+  <sodipodi:namedview
+     id="namedview212"
+     pagecolor="#ffffff"
+     bordercolor="#000000"
+     borderopacity="0.25"
+     inkscape:showpageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:zoom="1.7773802"
+     inkscape:cx="87.769629"
+     inkscape:cy="237.70941"
+     inkscape:window-width="3008"
+     inkscape:window-height="1634"
+     inkscape:window-x="1512"
+     inkscape:window-y="30"
+     inkscape:window-maximized="0"
+     inkscape:current-layer="svg212"
+     showguides="false" />
+  <g
+     id="g165"
+     style="fill:none;stroke:#000000"
+     transform="translate(-17.860179,-7.997986)" />
+  <g
+     id="g167"
+     transform="translate(-17.860179,-7.997986)" />
+  <g
+     id="g168"
+     transform="translate(-17.860179,-7.997986)" />
+  <g
+     id="g169"
+     transform="translate(-17.860179,-7.997986)" />
+  <g
+     id="g215"
+     transform="translate(-105.62981,-62.010066)">
+    <g
+       class="composite-shape"
+       id="g4"
+       style="fill:none;stroke:#000000"
+       transform="translate(-23.870192,-36.600962)">
+      <path
+         class="real"
+         d="m 130,193 c 0,-3.31 2.69,-6 6,-6 h 28 l 6,6 v 24 h -40 z"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path4" />
+    </g>
+    <g
+       class="composite-shape"
+       id="g123"
+       style="fill:none;stroke:#000000"
+       transform="translate(61.662264,44.557692)">
+      <path
+         class="real"
+         d="m 50,43 c 0,-3.31 2.69,-6 6,-6 h 48 l 6,6 V 67 H 50 Z"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path122" />
+    </g>
+    <g
+       class="composite-shape"
+       id="g123-9"
+       style="fill:none;stroke:#000000"
+       transform="translate(61.662264,181.99039)">
+      <path
+         class="real"
+         d="m 50,43 c 0,-3.31 2.69,-6 6,-6 h 48 l 6,6 V 67 H 50 Z"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path122-0" />
+    </g>
+    <g
+       class="arrow-line"
+       id="g126"
+       style="fill:none;stroke:#000000"
+       transform="translate(42.102416,181.78469)">
+      <path
+         class="connection real"
+         stroke-dasharray="none"
+         d="M 100,67 V 87"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path125" />
+    </g>
+    <g
+       class="arrow-line"
+       id="g128"
+       style="fill:none;stroke:#000000"
+       transform="translate(51.19895,45.010066)">
+      <path
+         class="connection real"
+         stroke-dasharray="none"
+         d="M 90,17 V 37"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path127" />
+    </g>
+    <g
+       id="g213">
+      <g
+         id="g212">
+        <g
+           class="arrow-line"
+           id="g124"
+           style="fill:none;stroke:#000000"
+           transform="matrix(1,0,0,1.9349158,65.245192,-18.081671)">
+          <path
+             class="connection real"
+             stroke-dasharray="none"
+             d="M 60,67 V 87"
+             style="fill:none;stroke:#000000;stroke-width:1"
+             id="path123" />
+        </g>
+      </g>
+      <g
+         class="arrow-line"
+         id="g130"
+         style="fill:none;stroke:#000000"
+         transform="translate(65.245193,49.331731)">
+        <path
+           class="connection real"
+           stroke-dasharray="none"
+           d="m 60,77 v 7"
+           style="fill:none;stroke:#000000;stroke-width:1"
+           id="path128" />
+        <g
+           stroke="none"
+           fill="#000"
+           transform="rotate(-90,73.5,13.5)"
+           style="fill:#000000;stroke:none;stroke-width:1"
+           id="g129">
+          <path
+             d="M 8.93,-4.29 0,0 8.93,4.29 Z"
+             id="path129" />
+        </g>
+      </g>
+    </g>
+    <g
+       id="g212-8"
+       transform="matrix(1,0,0,2.7682393,35.839281,-196.88329)"
+       style="fill:none">
+      <g
+         class="arrow-line"
+         id="g124-1"
+         style="fill:none;stroke:#000000"
+         transform="matrix(1,0,0,1.9349158,65.245192,-18.081671)">
+        <path
+           class="connection real"
+           stroke-dasharray="none"
+           d="M 60,67 V 87"
+           style="fill:none;stroke:#000000;stroke-width:1"
+           id="path123-4" />
+      </g>
+    </g>
+    <g
+       class="arrow-line"
+       id="g130-7"
+       style="fill:none;stroke:#000000"
+       transform="translate(101.08447,86.310099)">
+      <path
+         class="connection real"
+         stroke-dasharray="none"
+         d="m 60,77 v 7"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path128-1" />
+      <g
+         stroke="none"
+         fill="#000"
+         transform="rotate(-90,73.5,13.5)"
+         style="fill:#000000;stroke:none;stroke-width:1"
+         id="g129-3">
+        <path
+           d="M 8.93,-4.29 0,0 8.93,4.29 Z"
+           id="path129-8" />
+      </g>
+    </g>
+    <g
+       id="g213-6"
+       style="fill:none"
+       transform="translate(-0.35302654,68.870195)">
+      <g
+         id="g212-5">
+        <g
+           class="arrow-line"
+           id="g124-7"
+           style="fill:none;stroke:#000000"
+           transform="matrix(1,0,0,1.9349158,65.245192,-18.081671)">
+          <path
+             class="connection real"
+             stroke-dasharray="none"
+             d="M 60,67 V 87"
+             style="fill:none;stroke:#000000;stroke-width:1"
+             id="path123-6" />
+        </g>
+      </g>
+      <g
+         class="arrow-line"
+         id="g130-2"
+         style="fill:none;stroke:#000000"
+         transform="translate(65.245193,49.331731)">
+        <path
+           class="connection real"
+           stroke-dasharray="none"
+           d="m 60,77 v 7"
+           style="fill:none;stroke:#000000;stroke-width:1"
+           id="path128-7" />
+        <g
+           stroke="none"
+           fill="#000"
+           transform="rotate(-90,73.5,13.5)"
+           style="fill:#000000;stroke:none;stroke-width:1"
+           id="g129-0">
+          <path
+             d="M 8.93,-4.29 0,0 8.93,4.29 Z"
+             id="path129-3" />
+        </g>
+      </g>
+    </g>
+    <g
+       class="arrow-line"
+       id="g132"
+       style="fill:none;stroke:#000000"
+       transform="translate(42.102416,181.78469)">
+      <path
+         class="connection real"
+         stroke-dasharray="none"
+         d="m 100,77 v 7"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path130" />
+      <g
+         stroke="none"
+         fill="#000"
+         transform="rotate(-90,93.5,-6.5)"
+         style="fill:#000000;stroke:none;stroke-width:1"
+         id="g131">
+        <path
+           d="M 8.93,-4.29 0,0 8.93,4.29 Z"
+           id="path131" />
+      </g>
+    </g>
+    <g
+       class="arrow-line"
+       id="g134"
+       style="fill:none;stroke:#000000"
+       transform="translate(51.19895,45.010066)">
+      <path
+         class="connection real"
+         stroke-dasharray="none"
+         d="m 90,17 v 7"
+         style="fill:none;stroke:#000000;stroke-width:1"
+         id="path132" />
+      <g
+         stroke="none"
+         fill="#000"
+         transform="rotate(-90,58.5,-31.5)"
+         style="fill:#000000;stroke:none;stroke-width:1"
+         id="g133">
+        <path
+           d="M 8.93,-4.29 0,0 8.93,4.29 Z"
+           id="path133" />
+      </g>
+    </g>
+    <text
+       xml:space="preserve"
+       id="text213"
+       style="text-align:start;writing-mode:lr-tb;direction:ltr;white-space:pre;shape-inside:url(#rect213);display:inline;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:5.33333;paint-order:stroke markers fill"
+       transform="translate(7.9567308,1.5913462)"><tspan
+         x="127.9043"
+         y="99.664761"
+         id="tspan1">B</tspan></text>
+    <text
+       xml:space="preserve"
+       id="text214"
+       style="text-align:start;writing-mode:lr-tb;direction:ltr;white-space:pre;shape-inside:url(#rect214);display:inline;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:5.33333;paint-order:stroke markers fill"
+       transform="translate(2.3870192,0.39783654)"><tspan
+         x="135.46289"
+         y="238.11203"
+         id="tspan2">A</tspan></text>
+    <text
+       xml:space="preserve"
+       id="text215"
+       style="text-align:start;writing-mode:lr-tb;direction:ltr;white-space:pre;shape-inside:url(#rect215);display:inline;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:5.33333;paint-order:stroke markers fill"
+       transform="translate(4.7740385,0.39783654)"><tspan
+         x="116.36719"
+         y="168.49093"
+         id="tspan3">X</tspan></text>
+  </g>
+</svg>

From 026d68f2a5a5923e2710a490bf50ba6fa76db44d Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 16:01:58 -0400
Subject: [PATCH 32/33] add docs about anyonic contractions

---
 docs/src/man/contractions.md | 59 +++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/docs/src/man/contractions.md b/docs/src/man/contractions.md
index 10c868ae3..af0dc96a3 100644
--- a/docs/src/man/contractions.md
+++ b/docs/src/man/contractions.md
@@ -252,7 +252,64 @@ exp(t) ≈ exp_via_mul(t, 10)
 
 ## Anyonic tensor contractions
 
-TODO
+When `BraidingStyle(I) == Anyonic()`, the situation is more restrictive still.
+The relevant group describing the exchange of two lines is no longer the permutation group but the full braid group, so even a double crossing is non-trivial and there is no preferred way to reorder lines in a diagram.
+As a consequence, the implicit reordering that `@tensor` performs is no longer well-defined, and attempting an anyonic contraction with `@tensor` raises a `SectorMismatch` error.
+
+```@example anyoncontraction
+using TensorKit # hide
+V = Vect[FibonacciAnyon](:I => 1, :τ => 1)
+A = randn(ComplexF64, V ← V ⊗ V)
+B = randn(ComplexF64, V ⊗ V ← V)
+try
+    @tensor C[i; j] := A[i; k l] * B[k l; j]
+catch err
+    err
+end
+```
+
+The way out is to write the contraction as a literal *planar* diagram, in which every required crossing is made explicit through a braiding tensor.
+This is what the `@planar` macro provides.
+
+### The `@planar` macro
+
+The surface syntax of `@planar` is identical to that of `@tensor`, but with a number of additional restrictions.
+
+A diagram is *planar* in this context when it can be drawn on a sheet of paper without any of its lines crossing, and additionally with all open legs ending on the exterior of the diagram.
+The second condition rules out arrangements in which an open leg is enclosed by contracted ones, even if the resulting diagram itself contains no crossings.
+
+For the macro to recognise this layout unambiguously, the codomain–domain separator `;` must be present in every index list.
+It fixes which legs sit on the top (codomain) and which on the bottom (domain) of each tensor box, and changing the partition can change whether a given index pattern is planar.
+
+Planarity is moreover enforced for each binary contraction, not only for the overall expression.
+The pairwise contraction order can therefore matter: an expression whose final layout is planar may still be rejected when an intermediate contraction produces a non-planar subdiagram.
+Manually controlling the order, for instance via parentheses, NCON-style numbering, or the `order=...` keyword, is still supported, but must be done with care.
+
+Finally, the name `τ` is reserved for the braiding tensor: every literal crossing must be written out as a `τ[a b; c d]` factor, with its adjoint `τ'[a b; c d]` representing the inverse (under-)crossing.
+The `BraidingTensor` itself does not need to be constructed by the user; the macro figures out the appropriate spaces from the surrounding contraction.
+Any layout the macro cannot identify as planar is rejected at parse time with `ArgumentError("not a planar diagram expression: ...")`.
+
+To make this concrete, consider the contraction `A * B` for two anyonic tensors, written in a manifestly planar fashion:
+
+```@example anyoncontraction
+@planar C1[i; j] := A[i; k l] * B[k l; j]
+```
+
+Inserting an explicit braiding tensor on the contracted legs gives a genuinely different result, reflecting the non-trivial R-symbols of the anyon braiding:
+
+```@example anyoncontraction
+@planar C2[i; j] := A[i; k l] * τ[k l; m n] * B[m n; j]
+C1 ≈ C2
+```
+
+Both expressions correspond to valid, but distinct, tensor network diagrams, and the choice between them must be made explicit by the user.
+
+### The `@plansor` macro
+
+For code that should work uniformly across braiding styles, TensorKit provides the `@plansor` macro.
+It inspects the `BraidingStyle` of the first non-braiding tensor in the expression and dispatches to `@tensor` for `Bosonic` sectors, and to `@planar` otherwise.
+Any explicit `τ` factors that appear in the expression are silently removed in the bosonic case, where braidings are trivial, and faithfully evaluated otherwise.
+This makes `@plansor` the natural choice for generic library code that wishes to remain correct regardless of the underlying symmetry.
 
 
 [^Mortier]:      Mortier, Q., Devos, L., Burgelman, L., et al. (2025). Fermionic Tensor Network Methods. SciPost Physics 18, no. 1. [10.21468/SciPostPhys.18.1.012](https://doi.org/10.21468/SciPostPhys.18.1.012).

From 26c844ae5d7562c8c72d398e4ec75e63c0cde88e Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Tue, 19 May 2026 16:10:06 -0400
Subject: [PATCH 33/33] Move documentation changes to separate PR

Docs content is being added back in a stacked follow-up PR to keep this one
reviewable. The minimal docs/src/lib/tensors.md change is kept here because
removing the @docs block for the now-deprecated add_permute!/add_braid!/add_transpose!
wrappers is required for the docs build to succeed.
---
 docs/make.jl                                  |  12 +-
 docs/src/lib/fusiontrees.md                   |   1 -
 docs/src/lib/sectors.md                       |   1 -
 docs/src/lib/spaces.md                        |   1 -
 docs/src/man/contractions.md                  | 315 ----------------
 docs/src/man/factorizations.md                |  46 ---
 .../man/img/tensor-fermioniccontraction.svg   | 317 ----------------
 docs/src/man/indexmanipulations.md            | 108 ------
 docs/src/man/linearalgebra.md                 |  85 -----
 docs/src/man/tensormanipulations.md           | 337 ++++++++++++++++++
 docs/src/man/tensors.md                       |   2 +-
 11 files changed, 341 insertions(+), 884 deletions(-)
 delete mode 100644 docs/src/man/contractions.md
 delete mode 100644 docs/src/man/factorizations.md
 delete mode 100644 docs/src/man/img/tensor-fermioniccontraction.svg
 delete mode 100644 docs/src/man/indexmanipulations.md
 delete mode 100644 docs/src/man/linearalgebra.md
 create mode 100644 docs/src/man/tensormanipulations.md

diff --git a/docs/make.jl b/docs/make.jl
index d72b851a6..34b025580 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -9,7 +9,7 @@ end
 using Documenter
 using Random
 using TensorKit
-using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple, IndexTuple
+using TensorKit: FusionTreePair, FusionTreeBlock, Index2Tuple
 using TensorKit.TensorKitSectors
 using TensorKit.MatrixAlgebraKit
 using DocumenterInterLinks
@@ -26,14 +26,8 @@ pages = [
         "man/intro.md", "man/tutorial.md",
         "man/spaces.md", "man/symmetries.md",
         "man/sectors.md", "man/gradedspaces.md",
-        "man/fusiontrees.md",
-        "Tensors" => [
-            "man/tensors.md",
-            "man/linearalgebra.md",
-            "man/indexmanipulations.md",
-            "man/factorizations.md",
-            "man/contractions.md",
-        ],
+        "man/fusiontrees.md", "man/tensors.md",
+        "man/tensormanipulations.md",
     ],
     "Library" => [
         "lib/sectors.md", "lib/fusiontrees.md",
diff --git a/docs/src/lib/fusiontrees.md b/docs/src/lib/fusiontrees.md
index 57033eca6..8e037af93 100644
--- a/docs/src/lib/fusiontrees.md
+++ b/docs/src/lib/fusiontrees.md
@@ -2,7 +2,6 @@
 
 ```@meta
 CurrentModule = TensorKit
-CollapsedDocStrings = true
 ```
 
 # Type hierarchy
diff --git a/docs/src/lib/sectors.md b/docs/src/lib/sectors.md
index 8a696c675..f56980bf0 100644
--- a/docs/src/lib/sectors.md
+++ b/docs/src/lib/sectors.md
@@ -2,7 +2,6 @@
 
 ```@meta
 CurrentModule = TensorKit
-CollapsedDocStrings = true
 ```
 
 ## Type hierarchy
diff --git a/docs/src/lib/spaces.md b/docs/src/lib/spaces.md
index a6cce06a1..e5705fe3e 100644
--- a/docs/src/lib/spaces.md
+++ b/docs/src/lib/spaces.md
@@ -2,7 +2,6 @@
 
 ```@meta
 CurrentModule = TensorKit
-CollapsedDocStrings = true
 ```
 
 ## Type hierarchy
diff --git a/docs/src/man/contractions.md b/docs/src/man/contractions.md
deleted file mode 100644
index af0dc96a3..000000000
--- a/docs/src/man/contractions.md
+++ /dev/null
@@ -1,315 +0,0 @@
-# [Tensor contractions and tensor networks](@id ss_tensor_contraction)
-
-One of the most important operation with tensor maps is to compose them, more generally known as contracting them.
-As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines eminating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist.
-Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``.
-The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed.
-Such a two-dimensional diagram cannot easily be encoded in a single line of code.
-
-However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial.
-This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint.
-The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below.
-
-In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain.
-We can then rearrange the morphism to be all ligned up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``.
-This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially.
-Similarly, double crossings can also be omitted.
-As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connect, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism.
-If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices.
-We sketch such a rearrangement in the following picture
-
-```@raw html
-<img src="../img/tensor-bosoniccontraction.svg" alt="tensor unitary" class="color-invertible"/>
-```
-
-Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimicks [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation conventation.
-Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays.
-For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl).
-
-The above picture would be encoded as
-```julia
-@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-```
-or
-```julia
-@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5]
-```
-where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers.
-
-A number of remarks are in order.
-TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label.
-Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`.
-Index `N₁ + 1` then corresponds to the first space in `domain(A)`.
-However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indiated by the dotted box around ``A`` in the above picture.
-The same holds for the other tensor maps.
-Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices.
-
-With the current syntax, we create a new object `E` because we use the definition operator `:=`.
-Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`.
-We can also directly define `E` with the correct codomain and domain by rather using
-```julia
-@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-```
-or
-```julia
-@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-```
-where the latter syntax can also be used when the codomain is empty.
-When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory.
-Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain seperately using the above syntax, has no effect, as the bipartition of indices are already fixed by the existing object.
-Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent
-```julia
-@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
-@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z]
-@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z]
-```
-and none of those will or can change the partition of the indices of `E` into its codomain and its domain.
-
-Two final remarks are in order.
-Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist.
-As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network.
-For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriotely combining spaces into a single line, we indeed find
-
-```@raw html
-<img src="../img/tensor-contractionreorder.svg" alt="tensor contraction reorder" class="color-invertible"/>
-```
-
-or thus, the following two lines of code yield the same result
-```julia
-@tensor C[i, j] := B[i, k] * A[k, j]
-@tensor C[i, j] := A[k, j] * B[i, k]
-```
-Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner.
-In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process.
-There is also an `@ncon` macro and `ncon` function, an we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work.
-
-A final remark involves the use of adjoints of tensors.
-The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance.
-Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically.
-However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`.
-Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`.
-For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions:
-
-```julia
-@tensor C[i, j] := B'[i, k] * A[k, j]
-@tensor C[i, j] := conj(B[k, i]) * A[k, j]
-```
-
-For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`.
-
-## Fermionic tensor contractions
-
-Whenever `BraidingStyle(i) == Fermionic()`, some complications come up.
-The most important distinction from the `Bosonic()` case is that twists are no longer trivial, such that we must be careful about how we can manipulate network diagrams.
-
-To illustrate these complications, we take a look at a concrete example first, and study the following tensor network:
-
-```@raw html
-<img src="../img/tensor-fermioniccontraction.svg" alt="fermionic contraction example" class="color-invertible"/>
-```
-
-```@example fermioncontraction
-using TensorKit # hide
-V₁ = Vect[FermionParity](0 => 1, 1 => 1)
-V₂ = Vect[FermionParity](0 => 2, 1 => 2)
-A = rand(V₁ ← V₁ ⊗ V₂)
-X = rand(V₁ ← V₁)
-B = rand(V₁ ⊗ V₂ ← V₁)
-```
-
-We can expand this into binary contractions, by first contracting `X` with `A`, and then contracting the result with `B`:
-
-```@example fermioncontraction
-AX = repartition(A, 2, 1) * X
-AXB = repartition(AX, 1, 2) * B
-```
-
-Alternatively, we could decide that we first wish to contract `A` with `B`, and only then contract the result with `X`:
-
-```@example fermioncontraction
-AB = permute(A, ((1, 2), (3,))) * permute(B, ((2,), (1, 3)))
-ABX = repartition(permute(AB, ((1, 4), (2, 3))) * repartition(X, 2, 0), 1, 1)
-```
-
-This is where the issue becomes clearer, as the results are no longer equal:
-
-```@example fermioncontraction
-AXB ≈ ABX
-```
-
-### Trivializing the twist
-
-So what happened?
-If we carefully inspect what we actually computed here, we can show that in order to deform one diagram into the other, we have to introduce a self-crossing, which then altered the result.
-While the example here is still simple to follow, in general we would like that the result of `@tensor` expressions does not depend on the input order of the tensors.
-This is especially true for larger expressions where we wish to dynamically compute the optimal contraction order, as this would alter the order in a very non-transparent manner.
-
-The way out of this effectively consists of absorbing this twist in the coevaluation map ``η``.
-This modified map ``̃η := η ∘ θ`` where ``θ`` represents the twist ensures that the result no longer depends on the order of evaluation.
-In particular, one can show that any time two tensors would swap places, we would simultaneously exchange one evaluation map ``ϵ`` for a coevaluation ``̃η``, while also incurring a twist ``θ`` such that both cancel out.
-To make this concrete, we show how our previous example now leads to a unique result:
-
-```@example fermioncontraction
-function fermion_mul(A, B)
-    return A * twist(B, findall(isdual, codomain(B).spaces))
-end
-
-# order I:
-AX = fermion_mul(repartition(A, 2, 1),  X)
-AXB = fermion_mul(repartition(AX, 1, 2) , B)
-
-# order II:
-AB = fermion_mul(permute(A, ((1, 2), (3,))), permute(B, ((2,), (1, 3))))
-ABX = repartition(fermion_mul(permute(AB, ((1, 4), (2, 3))), repartition(X, 2, 0)), 1, 1)
-
-AXB ≈ ABX
-```
-
-This is the so-called **supertrace** formalism, and is effectively what `@tensor` ends up implementing for fermionic contractions.
-For more details about this formalism, we refer to [^Mortier].
-
-```@example fermioncontraction
-# @tensor
-@tensor result[-1; -2] := A[-1; 1 2] * X[1; 3] * B[3 2; -2]
-
-AXB ≈ result
-```
-
-### (Non)-unitarity
-
-While this modified ``̃η`` solves the issues related to contractions, it does come at a cost.
-The main issue is that this map does not constitute a positive definite map, and in particular is at odds with a positive inner product.
-Such a positive inner product is however required to properly define (orthogonal) factorizations, non-negative norms, etc.
-
-Therefore, we reserve the supertrace formalism exclusively for tensor contractions.
-For matrix-like operations such as factorizations, matrix functions, norms, etc, we retain the positive definite inner product.
-It is also always possible to manually emulate one or the other, by inserting appropriate calls to `twist`.
-In what follows, we simply showcase some noteworthy differences between the two formalisms, as these can be a common source of errors.
-Throughout, we use the following simple fermionic tensor as a running example:
-
-```@example fermionnorm
-using TensorKit # hide
-V = Vect[FermionParity](0 => 1, 1 => 1)
-t = ones(V' ← V')
-```
-
-- Computing a norm via a contraction:
-  the squared norm of `t`, computed via the supertrace contraction, no longer agrees with `norm(t)^2`.
-  In particular, the `@tensor` self-contraction can even vanish for a manifestly non-zero tensor:
-
-```@example fermionnorm
-norm(t)^2, @tensor conj(t[a; b]) * t[a; b]
-```
-
-Inserting a `twist` on the contracted codomain index cancels the twist that `@tensor` automatically introduces, and recovers the trace-formalism result:
-
-```@example fermionnorm
-norm(t)^2 ≈ @tensor conj(t[a; b]) * twist(t, 1)[a; b]
-```
-
-- Using unitarity to simplify `U * U' ≈ I`:
-  the factor `U` returned by `svd_compact` is left-isometric in the *trace* sense, i.e. `U' * U ≈ id(domain(U))` as a matrix product, but this identity no longer holds when the same product is written as a tensor contraction:
-
-```@example fermionnorm
-U, S, Vᴴ = svd_compact(t)
-@tensor UdU[i; j] := conj(U[k; i]) * U[k; j]
-U' * U ≈ id(domain(U)), UdU ≈ id(domain(U))
-```
-
-The matrix-mul version satisfies orthogonality, but the `@tensor` version differs by the fermionic twist on the contracted index.
-This is a common pitfall whenever an isometry obtained from a factorization is fed straight into an `@tensor` expression.
-
-- Computing a matrix function through a manual Taylor expansion:
-  matrix functions such as `exp`, `log`, `sqrt` are defined through the matrix product (trace formalism) and therefore have no immediate counterpart in terms of `@tensor` expressions.
-  In particular, replacing each matrix power by an `@tensor` self-contraction yields a different result, even at low order:
-
-```@example fermionnorm
-function exp_via_tensor(t, order)
-    out = id(domain(t))
-    tn = id(domain(t))
-    for n in 1:order
-        @tensor next[a; b] := tn[a; c] * t[c; b]
-        tn = next
-        out += tn / factorial(n)
-    end
-    return out
-end
-exp(t) ≈ exp_via_tensor(t, 10)
-```
-
-The same Taylor expansion written with the matrix product instead does reproduce `exp(t)`, confirming that the discrepancy is in the contraction step rather than the truncation order:
-
-```@example fermionnorm
-exp_via_mul(t, order) = sum(t^n / factorial(n) for n in 0:order)
-exp(t) ≈ exp_via_mul(t, 10)
-```
-
-!!! note
-    Both the supertrace and the trace formalism constitute valid, consistent frameworks, each with their own advantages and disadvantages.
-    For practical applications, it can be convenient to select one or the other, and to take special care when trying to use properties of one framework in the other.
-    In general, each case must be carefully evaluated to check which framework is correct, but a good rule of thumb is to be careful when using properties of orthogonality in combination with `@tensor` expressions.
-
-
-## Anyonic tensor contractions
-
-When `BraidingStyle(I) == Anyonic()`, the situation is more restrictive still.
-The relevant group describing the exchange of two lines is no longer the permutation group but the full braid group, so even a double crossing is non-trivial and there is no preferred way to reorder lines in a diagram.
-As a consequence, the implicit reordering that `@tensor` performs is no longer well-defined, and attempting an anyonic contraction with `@tensor` raises a `SectorMismatch` error.
-
-```@example anyoncontraction
-using TensorKit # hide
-V = Vect[FibonacciAnyon](:I => 1, :τ => 1)
-A = randn(ComplexF64, V ← V ⊗ V)
-B = randn(ComplexF64, V ⊗ V ← V)
-try
-    @tensor C[i; j] := A[i; k l] * B[k l; j]
-catch err
-    err
-end
-```
-
-The way out is to write the contraction as a literal *planar* diagram, in which every required crossing is made explicit through a braiding tensor.
-This is what the `@planar` macro provides.
-
-### The `@planar` macro
-
-The surface syntax of `@planar` is identical to that of `@tensor`, but with a number of additional restrictions.
-
-A diagram is *planar* in this context when it can be drawn on a sheet of paper without any of its lines crossing, and additionally with all open legs ending on the exterior of the diagram.
-The second condition rules out arrangements in which an open leg is enclosed by contracted ones, even if the resulting diagram itself contains no crossings.
-
-For the macro to recognise this layout unambiguously, the codomain–domain separator `;` must be present in every index list.
-It fixes which legs sit on the top (codomain) and which on the bottom (domain) of each tensor box, and changing the partition can change whether a given index pattern is planar.
-
-Planarity is moreover enforced for each binary contraction, not only for the overall expression.
-The pairwise contraction order can therefore matter: an expression whose final layout is planar may still be rejected when an intermediate contraction produces a non-planar subdiagram.
-Manually controlling the order, for instance via parentheses, NCON-style numbering, or the `order=...` keyword, is still supported, but must be done with care.
-
-Finally, the name `τ` is reserved for the braiding tensor: every literal crossing must be written out as a `τ[a b; c d]` factor, with its adjoint `τ'[a b; c d]` representing the inverse (under-)crossing.
-The `BraidingTensor` itself does not need to be constructed by the user; the macro figures out the appropriate spaces from the surrounding contraction.
-Any layout the macro cannot identify as planar is rejected at parse time with `ArgumentError("not a planar diagram expression: ...")`.
-
-To make this concrete, consider the contraction `A * B` for two anyonic tensors, written in a manifestly planar fashion:
-
-```@example anyoncontraction
-@planar C1[i; j] := A[i; k l] * B[k l; j]
-```
-
-Inserting an explicit braiding tensor on the contracted legs gives a genuinely different result, reflecting the non-trivial R-symbols of the anyon braiding:
-
-```@example anyoncontraction
-@planar C2[i; j] := A[i; k l] * τ[k l; m n] * B[m n; j]
-C1 ≈ C2
-```
-
-Both expressions correspond to valid, but distinct, tensor network diagrams, and the choice between them must be made explicit by the user.
-
-### The `@plansor` macro
-
-For code that should work uniformly across braiding styles, TensorKit provides the `@plansor` macro.
-It inspects the `BraidingStyle` of the first non-braiding tensor in the expression and dispatches to `@tensor` for `Bosonic` sectors, and to `@planar` otherwise.
-Any explicit `τ` factors that appear in the expression are silently removed in the bosonic case, where braidings are trivial, and faithfully evaluated otherwise.
-This makes `@plansor` the natural choice for generic library code that wishes to remain correct regardless of the underlying symmetry.
-
-
-[^Mortier]:      Mortier, Q., Devos, L., Burgelman, L., et al. (2025). Fermionic Tensor Network Methods. SciPost Physics 18, no. 1. [10.21468/SciPostPhys.18.1.012](https://doi.org/10.21468/SciPostPhys.18.1.012).
diff --git a/docs/src/man/factorizations.md b/docs/src/man/factorizations.md
deleted file mode 100644
index 2fbd8b382..000000000
--- a/docs/src/man/factorizations.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# [Tensor factorizations](@id ss_tensor_factorization)
-
-```@setup tensors
-using TensorKit
-using LinearAlgebra
-```
-
-As tensors are linear maps, they suport various kinds of factorizations.
-These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices.
-
-TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data.
-For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/).
-
-Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization.
-In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref s_indexmanipulations) before the factorization.
-
-Some examples to conclude this section
-```@repl tensors
-V1 = SU₂Space(0 => 2, 1/2 => 1)
-V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1)
-
-t = randn(V1 ⊗ V1, V2);
-U, S, Vh = svd_compact(t);
-t ≈ U * S * Vh
-D, V = eigh_full(t' * t);
-D ≈ S * S
-U' * U ≈ id(domain(U))
-S
-
-Q, R = left_orth(t; alg = :svd);
-Q' * Q ≈ id(domain(Q))
-t ≈ Q * R
-
-U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1));
-Vh2 * Vh2' ≈ id(codomain(Vh2))
-S2
-ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1)))
-
-L, Q = right_orth(permute(t, ((1,), (2, 3))));
-codomain(L), domain(L), domain(Q)
-Q * Q'
-P = Q' * Q;
-P ≈ P * P
-t′ = permute(t, ((1,), (2, 3)));
-t′ ≈ t′ * P
-```
diff --git a/docs/src/man/img/tensor-fermioniccontraction.svg b/docs/src/man/img/tensor-fermioniccontraction.svg
deleted file mode 100644
index c0ec84cf9..000000000
--- a/docs/src/man/img/tensor-fermioniccontraction.svg
+++ /dev/null
@@ -1,317 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   width="66.532455"
-   height="206.77463"
-   style="fill:none"
-   version="1.1"
-   id="svg212"
-   sodipodi:docname="tensor-fermioniccontraction.svg"
-   inkscape:version="1.4.2 (ebf0e940, 2025-05-08)"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:svg="http://www.w3.org/2000/svg">
-  <defs
-     id="defs212">
-    <rect
-       x="116.36719"
-       y="157.54327"
-       width="15.913462"
-       height="17.106972"
-       id="rect215" />
-    <rect
-       x="135.46333"
-       y="227.16466"
-       width="14.322115"
-       height="15.117788"
-       id="rect214" />
-    <rect
-       x="127.90445"
-       y="88.717545"
-       width="27.052885"
-       height="15.117788"
-       id="rect213" />
-  </defs>
-  <sodipodi:namedview
-     id="namedview212"
-     pagecolor="#ffffff"
-     bordercolor="#000000"
-     borderopacity="0.25"
-     inkscape:showpageshadow="2"
-     inkscape:pageopacity="0.0"
-     inkscape:pagecheckerboard="0"
-     inkscape:deskcolor="#d1d1d1"
-     inkscape:zoom="1.7773802"
-     inkscape:cx="87.769629"
-     inkscape:cy="237.70941"
-     inkscape:window-width="3008"
-     inkscape:window-height="1634"
-     inkscape:window-x="1512"
-     inkscape:window-y="30"
-     inkscape:window-maximized="0"
-     inkscape:current-layer="svg212"
-     showguides="false" />
-  <g
-     id="g165"
-     style="fill:none;stroke:#000000"
-     transform="translate(-17.860179,-7.997986)" />
-  <g
-     id="g167"
-     transform="translate(-17.860179,-7.997986)" />
-  <g
-     id="g168"
-     transform="translate(-17.860179,-7.997986)" />
-  <g
-     id="g169"
-     transform="translate(-17.860179,-7.997986)" />
-  <g
-     id="g215"
-     transform="translate(-105.62981,-62.010066)">
-    <g
-       class="composite-shape"
-       id="g4"
-       style="fill:none;stroke:#000000"
-       transform="translate(-23.870192,-36.600962)">
-      <path
-         class="real"
-         d="m 130,193 c 0,-3.31 2.69,-6 6,-6 h 28 l 6,6 v 24 h -40 z"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path4" />
-    </g>
-    <g
-       class="composite-shape"
-       id="g123"
-       style="fill:none;stroke:#000000"
-       transform="translate(61.662264,44.557692)">
-      <path
-         class="real"
-         d="m 50,43 c 0,-3.31 2.69,-6 6,-6 h 48 l 6,6 V 67 H 50 Z"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path122" />
-    </g>
-    <g
-       class="composite-shape"
-       id="g123-9"
-       style="fill:none;stroke:#000000"
-       transform="translate(61.662264,181.99039)">
-      <path
-         class="real"
-         d="m 50,43 c 0,-3.31 2.69,-6 6,-6 h 48 l 6,6 V 67 H 50 Z"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path122-0" />
-    </g>
-    <g
-       class="arrow-line"
-       id="g126"
-       style="fill:none;stroke:#000000"
-       transform="translate(42.102416,181.78469)">
-      <path
-         class="connection real"
-         stroke-dasharray="none"
-         d="M 100,67 V 87"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path125" />
-    </g>
-    <g
-       class="arrow-line"
-       id="g128"
-       style="fill:none;stroke:#000000"
-       transform="translate(51.19895,45.010066)">
-      <path
-         class="connection real"
-         stroke-dasharray="none"
-         d="M 90,17 V 37"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path127" />
-    </g>
-    <g
-       id="g213">
-      <g
-         id="g212">
-        <g
-           class="arrow-line"
-           id="g124"
-           style="fill:none;stroke:#000000"
-           transform="matrix(1,0,0,1.9349158,65.245192,-18.081671)">
-          <path
-             class="connection real"
-             stroke-dasharray="none"
-             d="M 60,67 V 87"
-             style="fill:none;stroke:#000000;stroke-width:1"
-             id="path123" />
-        </g>
-      </g>
-      <g
-         class="arrow-line"
-         id="g130"
-         style="fill:none;stroke:#000000"
-         transform="translate(65.245193,49.331731)">
-        <path
-           class="connection real"
-           stroke-dasharray="none"
-           d="m 60,77 v 7"
-           style="fill:none;stroke:#000000;stroke-width:1"
-           id="path128" />
-        <g
-           stroke="none"
-           fill="#000"
-           transform="rotate(-90,73.5,13.5)"
-           style="fill:#000000;stroke:none;stroke-width:1"
-           id="g129">
-          <path
-             d="M 8.93,-4.29 0,0 8.93,4.29 Z"
-             id="path129" />
-        </g>
-      </g>
-    </g>
-    <g
-       id="g212-8"
-       transform="matrix(1,0,0,2.7682393,35.839281,-196.88329)"
-       style="fill:none">
-      <g
-         class="arrow-line"
-         id="g124-1"
-         style="fill:none;stroke:#000000"
-         transform="matrix(1,0,0,1.9349158,65.245192,-18.081671)">
-        <path
-           class="connection real"
-           stroke-dasharray="none"
-           d="M 60,67 V 87"
-           style="fill:none;stroke:#000000;stroke-width:1"
-           id="path123-4" />
-      </g>
-    </g>
-    <g
-       class="arrow-line"
-       id="g130-7"
-       style="fill:none;stroke:#000000"
-       transform="translate(101.08447,86.310099)">
-      <path
-         class="connection real"
-         stroke-dasharray="none"
-         d="m 60,77 v 7"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path128-1" />
-      <g
-         stroke="none"
-         fill="#000"
-         transform="rotate(-90,73.5,13.5)"
-         style="fill:#000000;stroke:none;stroke-width:1"
-         id="g129-3">
-        <path
-           d="M 8.93,-4.29 0,0 8.93,4.29 Z"
-           id="path129-8" />
-      </g>
-    </g>
-    <g
-       id="g213-6"
-       style="fill:none"
-       transform="translate(-0.35302654,68.870195)">
-      <g
-         id="g212-5">
-        <g
-           class="arrow-line"
-           id="g124-7"
-           style="fill:none;stroke:#000000"
-           transform="matrix(1,0,0,1.9349158,65.245192,-18.081671)">
-          <path
-             class="connection real"
-             stroke-dasharray="none"
-             d="M 60,67 V 87"
-             style="fill:none;stroke:#000000;stroke-width:1"
-             id="path123-6" />
-        </g>
-      </g>
-      <g
-         class="arrow-line"
-         id="g130-2"
-         style="fill:none;stroke:#000000"
-         transform="translate(65.245193,49.331731)">
-        <path
-           class="connection real"
-           stroke-dasharray="none"
-           d="m 60,77 v 7"
-           style="fill:none;stroke:#000000;stroke-width:1"
-           id="path128-7" />
-        <g
-           stroke="none"
-           fill="#000"
-           transform="rotate(-90,73.5,13.5)"
-           style="fill:#000000;stroke:none;stroke-width:1"
-           id="g129-0">
-          <path
-             d="M 8.93,-4.29 0,0 8.93,4.29 Z"
-             id="path129-3" />
-        </g>
-      </g>
-    </g>
-    <g
-       class="arrow-line"
-       id="g132"
-       style="fill:none;stroke:#000000"
-       transform="translate(42.102416,181.78469)">
-      <path
-         class="connection real"
-         stroke-dasharray="none"
-         d="m 100,77 v 7"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path130" />
-      <g
-         stroke="none"
-         fill="#000"
-         transform="rotate(-90,93.5,-6.5)"
-         style="fill:#000000;stroke:none;stroke-width:1"
-         id="g131">
-        <path
-           d="M 8.93,-4.29 0,0 8.93,4.29 Z"
-           id="path131" />
-      </g>
-    </g>
-    <g
-       class="arrow-line"
-       id="g134"
-       style="fill:none;stroke:#000000"
-       transform="translate(51.19895,45.010066)">
-      <path
-         class="connection real"
-         stroke-dasharray="none"
-         d="m 90,17 v 7"
-         style="fill:none;stroke:#000000;stroke-width:1"
-         id="path132" />
-      <g
-         stroke="none"
-         fill="#000"
-         transform="rotate(-90,58.5,-31.5)"
-         style="fill:#000000;stroke:none;stroke-width:1"
-         id="g133">
-        <path
-           d="M 8.93,-4.29 0,0 8.93,4.29 Z"
-           id="path133" />
-      </g>
-    </g>
-    <text
-       xml:space="preserve"
-       id="text213"
-       style="text-align:start;writing-mode:lr-tb;direction:ltr;white-space:pre;shape-inside:url(#rect213);display:inline;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:5.33333;paint-order:stroke markers fill"
-       transform="translate(7.9567308,1.5913462)"><tspan
-         x="127.9043"
-         y="99.664761"
-         id="tspan1">B</tspan></text>
-    <text
-       xml:space="preserve"
-       id="text214"
-       style="text-align:start;writing-mode:lr-tb;direction:ltr;white-space:pre;shape-inside:url(#rect214);display:inline;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:5.33333;paint-order:stroke markers fill"
-       transform="translate(2.3870192,0.39783654)"><tspan
-         x="135.46289"
-         y="238.11203"
-         id="tspan2">A</tspan></text>
-    <text
-       xml:space="preserve"
-       id="text215"
-       style="text-align:start;writing-mode:lr-tb;direction:ltr;white-space:pre;shape-inside:url(#rect215);display:inline;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:5.33333;paint-order:stroke markers fill"
-       transform="translate(4.7740385,0.39783654)"><tspan
-         x="116.36719"
-         y="168.49093"
-         id="tspan3">X</tspan></text>
-  </g>
-</svg>
diff --git a/docs/src/man/indexmanipulations.md b/docs/src/man/indexmanipulations.md
deleted file mode 100644
index b644eff0c..000000000
--- a/docs/src/man/indexmanipulations.md
+++ /dev/null
@@ -1,108 +0,0 @@
-# [Index manipulations](@id s_indexmanipulations)
-
-```@meta
-CollapsedDocStrings = true
-```
-
-```@setup indexmanip
-using TensorKit
-using LinearAlgebra
-```
-
-A `TensorMap{T, S, N₁, N₂}` is a linear map from a domain (a `ProductSpace{S, N₂}`) to a codomain (a `ProductSpace{S, N₁}`).
-In practice, the bipartition of the `N₁ + N₂` indices between domain and codomain rarely remains fixed: algorithms typically need to reshuffle indices between the two sides, reorder them, or change the arrow direction on individual indices before passing a tensor to a factorization or contraction.
-
-Index manipulations cover all such operations.
-They act on the structure of the tensor data in a way that is fully determined by the categorical data of the `sectortype`, such that TensorKit automatically manipulates the tensor entries accordingly.
-The operations fall into three groups, which mirror the structure of the source file:
-
-*   **Reweighting**: [`flip`](@ref) and [`twist`](@ref) apply local isomorphisms to individual indices without changing the index structure.
-*   **Space insertion/removal**: [`insertleftunit`](@ref), [`insertrightunit`](@ref) and [`removeunit`](@ref) add or remove trivial (scalar) index factors.
-*   **Index rearrangements**: [`permute`](@ref), [`braid`](@ref), [`transpose`](@ref) and [`repartition`](@ref) reorder indices and/or move them between domain and codomain.
-
-Throughout this page, new index positions are specified using `Index2Tuple{N₁, N₂}`, i.e. a pair `(p₁, p₂)` of index tuples.
-The indices listed in `p₁` form the new codomain and those in `p₂` form the new domain.
-The following helpers retrieve the current index structure of a tensor:
-
-```@docs; canonical=false
-numout
-numin
-numind
-codomainind
-domainind
-allind
-```
-
-## Reweighting
-
-Reweighting operations modify the entries of a tensor by applying local isomorphisms to individual indices, without changing the number of indices or their partition between domain and codomain.
-In particular, [`twist`](@ref) applies the topological spin (monoidal twist) to selected indices; this operation preserves the space of the indices and is completely trivial for `BraidingStyle(I) == Bosonic()`.
-In contrast, [`flip`](@ref) changes the arrow direction on selected indices by applying a (non-canonical!) isomorphism between the index space and its dual.
-
-```@docs; canonical=false
-twist(::AbstractTensorMap, ::Int)
-twist!
-flip(t::AbstractTensorMap, I)
-```
-
-## Inserting and removing unit spaces
-
-The next set of functions add or remove a trivial tensor product factor at a specified index position, without affecting any other indices.
-We distinguish between [`insertleftunit`](@ref), which inserts a unit index before index `i` (the unit index becoming index `i`),
-and [`insertrightunit`](@ref), which inserts after index `i` (the unit index becoming index `i + 1`);
-[`removeunit`](@ref) undoes either insertion.
-
-For tensors `t` with `UnitStyle(sectortype(t)) = SimpleUnit()`, the only relevant difference between `insertleftunit(t, i + 1)` and `insertrightunit(t, i)` is that `insertleftunit(t, numout(t) + 1)` inserts the unit index as first index in the domain, whereas `insertrightunit(t, numout(t))` will insert the unit index as last index in the codomain. 
-
-Passing `Val(i)` instead of an integer `i` for the position may improve type stability.
-
-```@docs; canonical=false
-insertleftunit(::AbstractTensorMap, ::Val{i}) where {i}
-insertrightunit(::AbstractTensorMap, ::Val{i}) where {i}
-removeunit(::AbstractTensorMap, ::Val{i}) where {i}
-```
-
-## Index rearrangements
-
-These operations reorder indices and/or move them between domain and codomain by applying the transposing or braiding isomorphisms of the underlying category.
-They form a hierarchy from most general to most restricted:
-
-- [`braid`](@ref) is the most general: it accepts any permutation and requires a `levels` argument — a tuple of heights, one per index — that determines whether each index crosses over or under the others it has to pass.
-- [`permute`](@ref) is a simpler interface for sector types with a symmetric braiding (`BraidingStyle(I) isa SymmetricBraiding`), where over- and under-crossings are equivalent and `levels` is therefore not needed.
-- [`transpose`](@ref) is restricted to *cyclic* permutations (indices do not cross).
-- [`repartition`](@ref) only moves the codomain/domain boundary without reordering the indices at all.
-
-For plain tensors (`sectortype(t) == Trivial`), `permute` and `braid` act like `permutedims` on the underlying array:
-
-```@repl indexmanip
-V = ℂ^2;
-t = randn(V ⊗ V ← V ⊗ V);
-ta = convert(Array, t);
-t′ = permute(t, ((4, 2, 3), (1,)));
-convert(Array, t′) ≈ permutedims(ta, (4, 2, 3, 1))
-```
-
-```@docs; canonical=false
-braid(::AbstractTensorMap, ::Index2Tuple, ::IndexTuple)
-braid!
-permute(::AbstractTensorMap, ::Index2Tuple)
-permute!(::AbstractTensorMap, ::AbstractTensorMap, ::Index2Tuple)
-transpose(::AbstractTensorMap, ::Index2Tuple)
-transpose!
-repartition(::AbstractTensorMap, ::Int, ::Int)
-repartition!
-```
-
-## Fusing and splitting indices
-
-There is no dedicated functionality for fusing or splitting indices.
-In the general case there is no canonical embedding of `V1 ⊗ V2` into the fused space `V = fuse(V1 ⊗ V2)`: any two such embeddings differ by a basis transform, i.e. there is a gauge freedom.
-TensorKit resolves this by requiring the user to construct an explicit isomorphism — the *fuser* — and contract it with the tensor.
-One particular isomorphism can be constructed using the [`unitary](@ref) function.
-It preserves norms and inner products, and has an inverse given by its adjoint. 
-For a plain tensor (`sectortype(t) == Trivial`), applying this particular `unitary` is equivalent to `reshape` on the underlying array.
-
-Fusing index `i` and `j = i+1` of a tensor `t` is then accomplished as
-
-
-The resulting `unitary` is a dense `TensorMap`, and this fusion and splitting approach is not optimized for maximal performance. However, because many tensor operations including tensor factorizations (SVD, QR, etc.) can be applied without needing any fusion, we do not expect fusion and splitting to be an essential part of performance critical parts of typical tensor algorithms. 
diff --git a/docs/src/man/linearalgebra.md b/docs/src/man/linearalgebra.md
deleted file mode 100644
index 98c9f489a..000000000
--- a/docs/src/man/linearalgebra.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# [Basic linear algebra](@id ss_tensor_linalg)
-
-```@setup tensors
-using TensorKit
-using LinearAlgebra
-```
-
-`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices.
-To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library.
-Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can then us be used without `using LinearAlgebra` explicitly.
-In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms.
-
-In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second.
-Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication.
-TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`.
-We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`.
-If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`.
-However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse.
-This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`.
-
-`AbstractTensorMap` instances behave themselves as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other.
-There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`.
-In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data.
-Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`.
-
-For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain.
-Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm.
-These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces.
-Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`.
-In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`.
-
-With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`.
-Note that for an instance `t::TensorMap{S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`.
-This should be mostly invisible to the user, as all methods should work for this type as well.
-It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported.
-There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once.
-Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`.
-
-`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed.
-
-When tensor map instances are endomorphisms, i.e. they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`.
-The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`.
-Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`.
-Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphisms that we discuss on the [Tensor factorizations](@ref ss_tensor_factorization) page.
-
-Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations.
-The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`.
-If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`.
-Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`.
-Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces.
-
-Time for some more examples:
-```@repl tensors
-using TensorKit # hide
-V1 = ℂ^2
-t = randn(V1 ← V1 ⊗ V1 ⊗ V1)
-t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t
-t2 = randn(ComplexF64, codomain(t), domain(t));
-dot(t2, t)
-tr(t2' * t)
-dot(t2, t) ≈ dot(t', t2')
-dot(t2, t2)
-norm(t2)^2
-t3 = copy!(similar(t, ComplexF64), t);
-t3 == t
-rmul!(t3, 0.8);
-t3 ≈ 0.8 * t
-axpby!(0.5, t2, 1.3im, t3);
-t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t
-t4 = randn(fuse(codomain(t)), codomain(t));
-t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t));
-mul!(t5, t4, t) == t4 * t
-inv(t4) * t4 ≈ id(codomain(t))
-t4 * inv(t4) ≈ id(fuse(codomain(t)))
-t4 \ (t4 * t) ≈ t
-t6 = randn(ComplexF64, V1, codomain(t));
-numout(t4) == numout(t6) == 1
-t7 = catcodomain(t4, t6);
-foreach(println, (codomain(t4), codomain(t6), codomain(t7)))
-norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2)
-t8 = t4 ⊗ t6;
-foreach(println, (codomain(t4), codomain(t6), codomain(t8)))
-foreach(println, (domain(t4), domain(t6), domain(t8)))
-norm(t8) ≈ norm(t4)*norm(t6)
-```
diff --git a/docs/src/man/tensormanipulations.md b/docs/src/man/tensormanipulations.md
new file mode 100644
index 000000000..15285fd78
--- /dev/null
+++ b/docs/src/man/tensormanipulations.md
@@ -0,0 +1,337 @@
+# [Manipulating tensors](@id s_tensormanipulations)
+
+## [Vector space and linear algebra operations](@id ss_tensor_linalg)
+
+`AbstractTensorMap` instances `t` represent linear maps, i.e. homomorphisms in a `𝕜`-linear category, just like matrices.
+To a large extent, they follow the interface of `Matrix` in Julia's `LinearAlgebra` standard library.
+Many methods from `LinearAlgebra` are (re)exported by TensorKit.jl, and can thus be used without `using LinearAlgebra` explicitly.
+In all of the following methods, the implementation acts directly on the underlying matrix blocks (typically using the same method) and never needs to perform any basis transforms.
+
+In particular, `AbstractTensorMap` instances can be composed, provided the domain of the first object coincides with the codomain of the second.
+Composing tensor maps uses the regular multiplication symbol as in `t = t1 * t2`, which is also used for matrix multiplication.
+TensorKit.jl also supports (and exports) the mutating method `mul!(t, t1, t2)`.
+We can then also try to invert a tensor map using `inv(t)`, though this can only exist if the domain and codomain are isomorphic, which can e.g. be checked as `fuse(codomain(t)) == fuse(domain(t))`.
+If the inverse is composed with another tensor `t2`, we can use the syntax `t1 \ t2` or `t2 / t1`.
+However, this syntax also accepts instances `t1` whose domain and codomain are not isomorphic, and then amounts to `pinv(t1)`, the Moore-Penrose pseudoinverse.
+This, however, is only really justified as minimizing the least squares problem if `InnerProductStyle(t) <: EuclideanProduct`.
+
+`AbstractTensorMap` instances themselves behave as vectors (i.e. they are `𝕜`-linear) and so they can be multiplied by scalars and, if they live in the same space, i.e. have the same domain and codomain, they can be added to each other.
+There is also a `zero(t)`, the additive identity, which produces a zero tensor with the same domain and codomain as `t`.
+In addition, `TensorMap` supports basic Julia methods such as `fill!` and `copy!`, as well as `copy(t)` to create a copy with independent data.
+Aside from basic `+` and `*` operations, TensorKit.jl reexports a number of efficient in-place methods from `LinearAlgebra`, such as `axpy!` (for `y ← α * x + y`), `axpby!` (for `y ← α * x + β * y`), `lmul!` and `rmul!` (for `y ← α * y` and `y ← y * α`, which is typically the same) and `mul!`, which can also be used for out-of-place scalar multiplication `y ← α * x`.
+
+For `S = spacetype(t)` where `InnerProductStyle(S) <: EuclideanProduct`, we can compute `norm(t)`, and for two such instances, the inner product `dot(t1, t2)`, provided `t1` and `t2` have the same domain and codomain.
+Furthermore, there is `normalize(t)` and `normalize!(t)` to return a scaled version of `t` with unit norm.
+These operations should also exist for `InnerProductStyle(S) <: HasInnerProduct`, but require an interface for defining a custom inner product in these spaces.
+Currently, there is no concrete subtype of `HasInnerProduct` that is not an `EuclideanProduct`.
+In particular, `CartesianSpace`, `ComplexSpace` and `GradedSpace` all have `InnerProductStyle(S) <: EuclideanProduct`.
+
+With tensors that have `InnerProductStyle(t) <: EuclideanProduct` there is associated an adjoint operation, given by `adjoint(t)` or simply `t'`, such that `domain(t') == codomain(t)` and `codomain(t') == domain(t)`.
+Note that for an instance `t::TensorMap{T, S, N₁, N₂}`, `t'` is simply stored in a wrapper called `AdjointTensorMap{T, S, N₂, N₁}`, which is another subtype of `AbstractTensorMap`.
+This should be mostly invisible to the user, as all methods should work for this type as well.
+It can be hard to reason about the index order of `t'`, i.e. index `i` of `t` appears in `t'` at index position `j = TensorKit.adjointtensorindex(t, i)`, where the latter method is typically not necessary and hence unexported.
+There is also a plural `TensorKit.adjointtensorindices` to convert multiple indices at once.
+Note that, because the adjoint interchanges domain and codomain, we have `space(t', j) == space(t, i)'`.
+
+`AbstractTensorMap` instances can furthermore be tested for exact (`t1 == t2`) or approximate (`t1 ≈ t2`) equality, though the latter requires that `norm` can be computed.
+
+When tensor map instances are endomorphisms, i.e. they have the same domain and codomain, there is a multiplicative identity which can be obtained as `one(t)` or `one!(t)`, where the latter overwrites the contents of `t`.
+The multiplicative identity on a space `V` can also be obtained using `id(A, V)` as discussed [above](@ref ss_tensor_construction), such that for a general homomorphism `t′`, we have `t′ == id(codomain(t′)) * t′ == t′ * id(domain(t′))`.
+Returning to the case of endomorphisms `t`, we can compute the trace via `tr(t)` and exponentiate them using `exp(t)`, or if the contents of `t` can be destroyed in the process, `exp!(t)`.
+Furthermore, there are a number of tensor factorizations for both endomorphisms and general homomorphisms that we discuss below.
+
+Finally, there are a number of operations that also belong in this paragraph because of their analogy to common matrix operations.
+The tensor product of two `TensorMap` instances `t1` and `t2` is obtained as `t1 ⊗ t2` and results in a new `TensorMap` with `codomain(t1 ⊗ t2) = codomain(t1) ⊗ codomain(t2)` and `domain(t1 ⊗ t2) = domain(t1) ⊗ domain(t2)`.
+If we have two `TensorMap{T, S, N, 1}` instances `t1` and `t2` with the same codomain, we can combine them in a way that is analogous to `hcat`, i.e. we stack them such that the new tensor `catdomain(t1, t2)` has also the same codomain, but has a domain which is `domain(t1) ⊕ domain(t2)`.
+Similarly, if `t1` and `t2` are of type `TensorMap{T, S, 1, N}` and have the same domain, the operation `catcodomain(t1, t2)` results in a new tensor with the same domain and a codomain given by `codomain(t1) ⊕ codomain(t2)`, which is the analogy of `vcat`.
+Note that direct sum only makes sense between `ElementarySpace` objects, i.e. there is no way to give a tensor product meaning to a direct sum of tensor product spaces.
+
+Time for some more examples:
+```@repl tensors
+using TensorKit # hide
+V1 = ℂ^2
+t = randn(V1 ← V1 ⊗ V1 ⊗ V1)
+t == t + zero(t) == t * id(domain(t)) == id(codomain(t)) * t
+t2 = randn(ComplexF64, codomain(t), domain(t));
+dot(t2, t)
+tr(t2' * t)
+dot(t2, t) ≈ dot(t', t2')
+dot(t2, t2)
+norm(t2)^2
+t3 = copy!(similar(t, ComplexF64), t);
+t3 == t
+rmul!(t3, 0.8);
+t3 ≈ 0.8 * t
+axpby!(0.5, t2, 1.3im, t3);
+t3 ≈ 0.5 * t2 + 0.8 * 1.3im * t
+t4 = randn(fuse(codomain(t)), codomain(t));
+t5 = TensorMap{Float64}(undef, fuse(codomain(t)), domain(t));
+mul!(t5, t4, t) == t4 * t
+inv(t4) * t4 ≈ id(codomain(t))
+t4 * inv(t4) ≈ id(fuse(codomain(t)))
+t4 \ (t4 * t) ≈ t
+t6 = randn(ComplexF64, V1, codomain(t));
+numout(t4) == numout(t6) == 1
+t7 = catcodomain(t4, t6);
+foreach(println, (codomain(t4), codomain(t6), codomain(t7)))
+norm(t7) ≈ sqrt(norm(t4)^2 + norm(t6)^2)
+t8 = t4 ⊗ t6;
+foreach(println, (codomain(t4), codomain(t6), codomain(t8)))
+foreach(println, (domain(t4), domain(t6), domain(t8)))
+norm(t8) ≈ norm(t4)*norm(t6)
+```
+
+## [Index manipulations](@id ss_indexmanipulation)
+
+In many cases, the bipartition of tensor indices (i.e. `ElementarySpace` instances) between the codomain and domain is not fixed throughout the different operations that need to be performed on that tensor map, i.e. we want to use the duality to move spaces from domain to codomain and vice versa.
+Furthermore, we want to use the braiding to reshuffle the order of the indices.
+
+For this, we use an interface that is closely related to that for manipulating splitting- fusion tree pairs, namely [`braid`](@ref) and [`permute`](@ref), with the interface
+
+```julia
+braid(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}, levels::IndexTuple{N₁+N₂,Int})
+```
+
+and
+
+```julia
+permute(t::AbstractTensorMap{T,S,N₁,N₂}, (p1, p2)::Index2Tuple{N₁′,N₂′}; copy = false)
+```
+
+both of which return an instance of `AbstractTensorMap{T, S, N₁′, N₂′}`.
+
+In these methods, `p1` and `p2` specify which of the original tensor indices ranging from `1` to `N₁ + N₂` make up the new codomain (with `N₁′` spaces) and new domain (with `N₂′` spaces).
+Hence, `(p1..., p2...)` should be a valid permutation of `1:(N₁ + N₂)`.
+Note that, throughout TensorKit.jl, permutations are always specified using tuples of `Int`s, for reasons of type stability.
+For `braid`, we also need to specify `levels` or depths for each of the indices of the original tensor, which determine whether indices will braid over or underneath each other (use the braiding or its inverse).
+We refer to the section on [manipulating fusion trees](@ref ss_fusiontrees) for more details.
+
+When `BraidingStyle(sectortype(t)) isa SymmetricBraiding`, we can use the simpler interface of `permute`, which does not require the argument `levels`.
+`permute` accepts a keyword argument `copy`.
+When `copy == true`, the result will be a tensor with newly allocated data that can independently be modified from that of the input tensor `t`.
+When `copy` takes the default value `false`, `permute` can try to return the result in a way that it shares its data with the input tensor `t`, though this is only possible in specific cases (e.g. when `sectortype(S) == Trivial` and `(p1..., p2...) = (1:(N₁+N₂)...)`).
+
+Both `braid` and `permute` come in a version where the result is stored in an already existing tensor, i.e. [`braid!(tdst, tsrc, (p1, p2), levels)`](@ref) and [`permute!(tdst, tsrc, (p1, p2))`](@ref).
+
+Another operation that belongs under index manipulations is taking the `transpose` of a tensor, i.e. `LinearAlgebra.transpose(t)` and `LinearAlgebra.transpose!(tdst, tsrc)`, both of which are reexported by TensorKit.jl.
+Note that `transpose(t)` is not simply equal to reshuffling domain and codomain with `braid(t, (reverse(domainind(t)), reverse(codomainind(t))), (1:(N₁+N₂)...,))`.
+Indeed, the graphical representation (where we draw the codomain and domain as a single object), makes clear that this introduces an additional (inverse) twist, which is then compensated in the `transpose` implementation.
+
+```@raw html
+<img src="../img/tensor-transpose.svg" alt="transpose" class="color-invertible"/>
+```
+
+In categorical language, the reason for this extra twist is that we use the left coevaluation ``η``, but the right evaluation ``\tilde{ϵ}``, when repartitioning the indices between domain and codomain.
+
+There are a number of other index related manipulations.
+We can apply a twist (or inverse twist) to one of the tensor map indices via [`twist(t, i; inv = false)`](@ref) or [`twist!(t, i; inv = false)`](@ref).
+Note that the latter method does not store the result in a new destination tensor, but just modifies the tensor `t` in place.
+Twisting several indices simultaneously can be obtained by using the defining property
+
+```math
+θ_{V⊗W} = τ_{W,V} ∘ (θ_W ⊗ θ_V) ∘ τ_{V,W} = (θ_V ⊗ θ_W) ∘ τ_{W,V} ∘ τ_{V,W},
+```
+
+but is currently not implemented explicitly.
+
+For all sector types `I` with `BraidingStyle(I) == Bosonic()`, all twists are `1` and thus have no effect.
+Let us start with some examples, in which we illustrate that, although `permute` might act in a highly non-trivial way on the fusion trees and on the corresponding data, after conversion to a regular `Array` (when possible), it just acts like `permutedims`.
+
+```@repl tensors
+domain(t) → codomain(t)
+ta = convert(Array, t);
+t′ = permute(t, (1, 2, 3, 4));
+domain(t′) → codomain(t′)
+convert(Array, t′) ≈ ta
+t′′ = permute(t, ((4, 2, 3), (1,)));
+domain(t′′) → codomain(t′′)
+convert(Array, t′′) ≈ permutedims(ta, (4, 2, 3, 1))
+transpose(t)
+convert(Array, transpose(t)) ≈ permutedims(ta, (4, 3, 2, 1))
+dot(t2, t) ≈ dot(transpose(t2), transpose(t))
+transpose(transpose(t)) ≈ t
+twist(t, 3) ≈ t
+```
+
+Note that `transpose` acts like one would expect on a `TensorMap{T, S, 1, 1}`.
+On a `TensorMap{T, S, N₁, N₂}`, because `transpose` replaces the codomain with the dual of the domain, which has its tensor product operation reversed, this in the end amounts to a complete reversal of all tensor indices when representing it as a plain multi-dimensional `Array`.
+Also, note that we have not defined the conjugation of `TensorMap` instances.
+One definition that one could think of is `conj(t) = adjoint(transpose(t))`.
+However, note that `codomain(adjoint(transpose(t))) == domain(transpose(t)) == dual(codomain(t))` and similarly `domain(adjoint(transpose(t))) == dual(domain(t))`, where `dual` of a `ProductSpace` is composed of the dual of the `ElementarySpace` instances, in reverse order of tensor product.
+This might be very confusing, and as such we leave tensor conjugation undefined.
+However, note that we have a conjugation syntax within the context of [tensor contractions](@ref ss_tensor_contraction).
+
+To show the effect of `twist`, we now consider a type of sector `I` for which `BraidingStyle(I) != Bosonic()`.
+In particular, we use `FibonacciAnyon`.
+We cannot convert the resulting `TensorMap` to an `Array`, so we have to rely on indirect tests to verify our results.
+
+```@repl tensors
+V1 = GradedSpace{FibonacciAnyon}(:I => 3, :τ => 2)
+V2 = GradedSpace{FibonacciAnyon}(:I => 2, :τ => 1)
+m = randn(Float32, V1, V2)
+transpose(m)
+twist(braid(m, ((2,), (1,)), (1, 2)), 1)
+t1 = randn(V1 * V2', V2 * V1);
+t2 = randn(ComplexF64, V1 * V2', V2 * V1);
+dot(t1, t2) ≈ dot(transpose(t1), transpose(t2))
+transpose(transpose(t1)) ≈ t1
+```
+
+A final operation that one might expect in this section is to fuse or join indices, and its inverse, to split a given index into two or more indices.
+For a plain tensor (i.e. with `sectortype(t) == Trivial`), this amounts to the equivalent of `reshape` on the multidimensional data.
+However, this represents only one possibility, as there is no canonically unique way to embed the tensor product of two spaces `V1 ⊗ V2` in a new space `V = fuse(V1 ⊗ V2)`.
+Such a mapping can always be accompanied by a basis transform.
+However, one particular choice is created by the function `isomorphism`, or for `EuclideanProduct` spaces, `unitary`.
+Hence, we can join or fuse two indices of a tensor by first constructing `u = unitary(fuse(space(t, i) ⊗ space(t, j)), space(t, i) ⊗ space(t, j))` and then contracting this map with indices `i` and `j` of `t`, as explained in the section on [contracting tensors](@ref ss_tensor_contraction).
+Note, however, that a typical algorithm is not expected to often need to fuse and split indices, as e.g. tensor factorizations can easily be applied without needing to `reshape` or fuse indices first, as explained in the next section.
+
+## [Tensor factorizations](@id ss_tensor_factorization)
+
+As tensors are linear maps, they support various kinds of factorizations.
+These functions all interpret the provided `AbstractTensorMap` instances as a map from `domain` to `codomain`, which can be thought of as reshaping the tensor into a matrix according to the current bipartition of the indices.
+
+TensorKit's factorizations are provided by [MatrixAlgebraKit.jl](https://github.com/QuantumKitHub/MatrixAlgebraKit.jl), which is used to supply both the interface, as well as the implementation of the various operations on the blocks of data.
+For specific details on the provided functionality, we refer to its [documentation page](https://quantumkithub.github.io/MatrixAlgebraKit.jl/stable/user_interface/decompositions/).
+
+Finally, note that each of the factorizations takes the current partition of `domain` and `codomain` as the *axis* along which to matricize and perform the factorization.
+In order to obtain factorizations according to a different bipartition of the indices, we can use any of the previously mentioned [index manipulations](@ref ss_indexmanipulation) before the factorization.
+
+Some examples to conclude this section
+```@repl tensors
+V1 = SU₂Space(0 => 2, 1/2 => 1)
+V2 = SU₂Space(0 => 1, 1/2 => 1, 1 => 1)
+
+t = randn(V1 ⊗ V1, V2);
+U, S, Vh = svd_compact(t);
+t ≈ U * S * Vh
+D, V = eigh_full(t' * t);
+D ≈ S * S
+U' * U ≈ id(domain(U))
+S
+
+Q, R = left_orth(t; alg = :svd);
+Q' * Q ≈ id(domain(Q))
+t ≈ Q * R
+
+U2, S2, Vh2, ε = svd_trunc(t; trunc = truncspace(V1));
+Vh2 * Vh2' ≈ id(codomain(Vh2))
+S2
+ε ≈ norm(block(S, Irrep[SU₂](1))) * sqrt(dim(Irrep[SU₂](1)))
+
+L, Q = right_orth(permute(t, ((1,), (2, 3))));
+codomain(L), domain(L), domain(Q)
+Q * Q'
+P = Q' * Q;
+P ≈ P * P
+t′ = permute(t, ((1,), (2, 3)));
+t′ ≈ t′ * P
+```
+
+## [Bosonic tensor contractions and tensor networks](@id ss_tensor_contraction)
+
+One of the most important operations with tensor maps is to compose them, more generally known as contracting them.
+As mentioned in the section on [category theory](@ref s_categories), a typical composition of maps in a ribbon category can graphically be represented as a planar arrangement of the morphisms (i.e. tensor maps, boxes with lines emanating from top and bottom, corresponding to source and target, i.e. domain and codomain), where the lines connecting the source and targets of the different morphisms should be thought of as ribbons, that can braid over or underneath each other, and that can twist.
+Technically, we can embed this diagram in ``ℝ × [0,1]`` and attach all the unconnected line endings corresponding to objects in the source at some position ``(x,0)`` for ``x∈ℝ``, and all line endings corresponding to objects in the target at some position ``(x,1)``.
+The resulting morphism is then invariant under what is known as *framed three-dimensional isotopy*, i.e. three-dimensional rearrangements of the morphism that respect the rules of boxes connected by ribbons whose open endings are kept fixed.
+Such a two-dimensional diagram cannot easily be encoded in a single line of code.
+
+However, things simplify when the braiding is symmetric (such that over- and under- crossings become equivalent, i.e. just crossings), and when twists, i.e. self-crossings in this case, are trivial.
+This amounts to `BraidingStyle(I) == Bosonic()` in the language of TensorKit.jl, and is true for any subcategory of ``\mathbf{Vect}``, i.e. ordinary tensors, possibly with some symmetry constraint.
+The case of ``\mathbf{SVect}`` and its subcategories, and more general categories, are discussed below.
+
+In the case of trivial twists, we can deform the diagram such that we first combine every morphism with a number of coevaluations ``η`` so as to represent it as a tensor, i.e. with a trivial domain.
+We can then rearrange the morphism to be all lined up horizontally, where the original morphism compositions are now being performed by evaluations ``ϵ``.
+This process will generate a number of crossings and twists, where the latter can be omitted because they act trivially.
+Similarly, double crossings can also be omitted.
+As a consequence, the diagram, or the morphism it represents, is completely specified by the tensors it is composed of, and which indices between the different tensors are connected, via the evaluation ``ϵ``, and which indices make up the source and target of the resulting morphism.
+If we also compose the resulting morphisms with coevaluations so that it has a trivial domain, we just have one type of unconnected lines, henceforth called open indices.
+We sketch such a rearrangement in the following picture
+
+```@raw html
+<img src="../img/tensor-bosoniccontraction.svg" alt="tensor bosonic contraction" class="color-invertible"/>
+```
+
+Hence, we can now specify such a tensor diagram, henceforth called a tensor contraction or also tensor network, using a one-dimensional syntax that mimics [abstract index notation](https://en.wikipedia.org/wiki/Abstract_index_notation) and specifies which indices are connected by the evaluation map using Einstein's summation convention.
+Indeed, for `BraidingStyle(I) == Bosonic()`, such a tensor contraction can take the same format as if all tensors were just multi-dimensional arrays.
+For this, we rely on the interface provided by the package [TensorOperations.jl](https://github.com/QuantumKitHub/TensorOperations.jl).
+
+The above picture would be encoded as
+```julia
+@tensor E[a, b, c, d, e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+or
+```julia
+@tensor E[:] := A[1, 2, -4, 3] * B[4, 5, -3, 3] * C[1, -5, 4, -2] * D[-1, 2, 5]
+```
+where the latter syntax is known as NCON-style, and labels the unconnected or outgoing indices with negative integers, and the contracted indices with positive integers.
+
+A number of remarks are in order.
+TensorOperations.jl accepts both integers and any valid variable name as dummy label for indices, and everything in between `[ ]` is not resolved in the current context but interpreted as a dummy label.
+Here, we label the indices of a `TensorMap`, like `A::TensorMap{T, S, N₁, N₂}`, in a linear fashion, where the first position corresponds to the first space in `codomain(A)`, and so forth, up to position `N₁`.
+Index `N₁ + 1` then corresponds to the first space in `domain(A)`.
+However, because we have applied the coevaluation ``η``, it actually corresponds to the corresponding dual space, in accordance with the interface of [`space(A, i)`](@ref) that we introduced [above](@ref ss_tensor_properties), and as indicated by the dotted box around ``A`` in the above picture.
+The same holds for the other tensor maps.
+Note that our convention also requires that we braid indices that we brought from the domain to the codomain, and so this is only unambiguous for a symmetric braiding, where there is a unique way to permute the indices.
+
+With the current syntax, we create a new object `E` because we use the definition operator `:=`.
+Furthermore, with the current syntax, it will be a `Tensor`, i.e. it will have a trivial domain, and correspond to the dotted box in the picture above, rather than the actual morphism `E`.
+We can also directly define `E` with the correct codomain and domain by rather using
+```julia
+@tensor E[a b c;d e] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+or
+```julia
+@tensor E[(a, b, c);(d, e)] := A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+```
+where the latter syntax can also be used when the codomain is empty.
+When using the assignment operator `=`, the `TensorMap` `E` is assumed to exist and the contents will be written to the currently allocated memory.
+Note that for existing tensors, both on the left hand side and right hand side, trying to specify the indices in the domain and the codomain separately using the above syntax has no effect, as the bipartition of indices is already fixed by the existing object.
+Hence, if `E` has been created by the previous line of code, all of the following lines are now equivalent
+```julia
+@tensor E[(a, b, c);(d, e)] = A[v, w, d, x] * B[y, z, c, x] * C[v, e, y, b] * D[a, w, z]
+@tensor E[a, b, c, d, e] = A[v w d; x] * B[(y, z, c); (x, )] * C[v e y; b] * D[a, w, z]
+@tensor E[a b; c d e] = A[v; w d x] * B[y, z, c, x] * C[v, e, y, b] * D[a w; z]
+```
+and none of those will or can change the partition of the indices of `E` into its codomain and its domain.
+
+Two final remarks are in order.
+Firstly, the order of the tensors appearing on the right hand side is irrelevant, as we can reorder them by using the allowed moves of the Penrose graphical calculus, which yields some crossings and a twist.
+As the latter is trivial, it can be omitted, and we just use the same rules to evaluate the newly ordered tensor network.
+For the particular case of matrix-matrix multiplication, which also captures more general settings by appropriately combining spaces into a single line, we indeed find
+
+```@raw html
+<img src="../img/tensor-contractionreorder.svg" alt="tensor contraction reorder" class="color-invertible"/>
+```
+
+or thus, the following two lines of code yield the same result
+```julia
+@tensor C[i, j] := B[i, k] * A[k, j]
+@tensor C[i, j] := A[k, j] * B[i, k]
+```
+Reordering of tensors can be used internally by the `@tensor` macro to evaluate the contraction in a more efficient manner.
+In particular, the NCON-style of specifying the contraction gives the user control over the order, and there are other macros, such as `@tensoropt`, that try to automate this process.
+There is also an `@ncon` macro and `ncon` function, and we recommend reading the [manual of TensorOperations.jl](https://quantumkithub.github.io/TensorOperations.jl/stable/) to learn more about the possibilities and how they work.
+
+A final remark involves the use of adjoints of tensors.
+The current framework is such that the user should not be too worried about the actual bipartition into codomain and domain of a given `TensorMap` instance.
+Indeed, for tensor contractions the `@tensor` macro figures out the correct manipulations automatically.
+However, when wanting to use the `adjoint` of an instance `t::TensorMap{T, S, N₁, N₂}`, the resulting `adjoint(t)` is an `AbstractTensorMap{T, S, N₂, N₁}` and one needs to know the values of `N₁` and `N₂` to know exactly where the `i`th index of `t` will end up in `adjoint(t)`, and hence the index order of `t'`.
+Within the `@tensor` macro, one can instead use `conj()` on the whole index expression so as to be able to use the original index ordering of `t`.
+For example, for `TensorMap{T, S, 1, 1}` instances, this yields exactly the equivalence one expects, namely one between the following two expressions:
+
+```julia
+@tensor C[i, j] := B'[i, k] * A[k, j]
+@tensor C[i, j] := conj(B[k, i]) * A[k, j]
+```
+
+For e.g. an instance `A::TensorMap{T, S, 3, 2}`, the following two syntaxes have the same effect within an `@tensor` expression: `conj(A[a, b, c, d, e])` and `A'[d, e, a, b, c]`.
+
+Some examples:
+
+## Fermionic tensor contractions
+
+TODO
+
+## Anyonic tensor contractions
+
+TODO
diff --git a/docs/src/man/tensors.md b/docs/src/man/tensors.md
index 155dbbe4d..2921e5f1b 100644
--- a/docs/src/man/tensors.md
+++ b/docs/src/man/tensors.md
@@ -5,7 +5,7 @@ using TensorKit
 using LinearAlgebra
 ```
 
-This page explains how to construct and access tensors in TensorKit.jl.
+This last page explains how to create and manipulate tensors in TensorKit.jl.
 As this is probably the most important part of the manual, we will also focus more strongly on the usage and interface, and less so on the underlying implementation.
 The only aspect of the implementation that we will address is the storage of the tensor data, as this is important to know how to create and initialize a tensor, but will in fact also shed light on how some of the methods work.