From 25876de797a71200c4aaff86a0e01315f22bbcfd Mon Sep 17 00:00:00 2001 From: Sergei Isakov Date: Tue, 16 Dec 2025 17:47:10 +0100 Subject: [PATCH 1/3] Add support for cuStateVecEx. --- CMakeLists.txt | 1 + Makefile | 15 + apps/Makefile | 6 + apps/make.sh | 3 +- apps/qsim_base_custatevecex.cu | 161 +++++ docs/cirq_interface.md | 8 +- lib/BUILD | 74 +++ lib/io.h | 12 + lib/io_file.h | 4 + lib/multiprocess_custatevecex.h | 213 ++++++ lib/run_custatevecex.h | 312 +++++++++ lib/simulator_custatevecex.h | 237 +++++++ lib/statespace_custatevecex.h | 431 ++++++++++++ lib/util_cuda.h | 19 +- lib/util_custatevec.h | 4 +- lib/util_custatevecex.h | 46 ++ lib/vectorspace_custatevecex.h | 611 ++++++++++++++++++ pybind_interface/Makefile | 10 +- pybind_interface/cuda/CMakeLists.txt | 2 +- pybind_interface/custatevec/CMakeLists.txt | 2 +- pybind_interface/custatevecex/CMakeLists.txt | 59 ++ .../custatevecex/pybind_main_custatevecex.cpp | 74 +++ .../custatevecex/pybind_main_custatevecex.h | 17 + pybind_interface/decide/CMakeLists.txt | 5 +- pybind_interface/decide/decide.cpp | 20 +- pybind_interface/hip/CMakeLists.txt | 2 +- qsimcirq/__init__.py | 12 +- qsimcirq/qsim_simulator.py | 25 +- qsimcirq_tests/qsimcirq_test.py | 109 ++++ setup.py | 1 + tests/Makefile | 17 + tests/hybrid_custatevecex_test.cu | 59 ++ tests/qtrajectory_custatevecex_test.cu | 88 +++ tests/run_custatevecex_test.cu | 262 ++++++++ tests/simulator_custatevecex_test.cu | 105 +++ tests/simulator_testfixture.h | 35 +- tests/statespace_custatevecex_test.cu | 119 ++++ 37 files changed, 3141 insertions(+), 39 deletions(-) create mode 100644 apps/qsim_base_custatevecex.cu create mode 100644 lib/multiprocess_custatevecex.h create mode 100644 lib/run_custatevecex.h create mode 100644 lib/simulator_custatevecex.h create mode 100644 lib/statespace_custatevecex.h create mode 100644 lib/util_custatevecex.h create mode 100644 lib/vectorspace_custatevecex.h create mode 100644 pybind_interface/custatevecex/CMakeLists.txt create mode 100644 pybind_interface/custatevecex/pybind_main_custatevecex.cpp create mode 100644 pybind_interface/custatevecex/pybind_main_custatevecex.h create mode 100644 tests/hybrid_custatevecex_test.cu create mode 100644 tests/qtrajectory_custatevecex_test.cu create mode 100644 tests/run_custatevecex_test.cu create mode 100644 tests/simulator_custatevecex_test.cu create mode 100644 tests/statespace_custatevecex_test.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index e8a92a47b..8b824c81d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ if(NOT CMAKE_APPLE_SILICON_PROCESSOR) add_subdirectory(pybind_interface/cuda) if(DEFINED ENV{CUQUANTUM_ROOT}) add_subdirectory(pybind_interface/custatevec) + add_subdirectory(pybind_interface/custatevecex) endif() elseif(has_hipcc) add_subdirectory(pybind_interface/hip) diff --git a/Makefile b/Makefile index b4e16da06..94a9bbcf9 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,10 @@ ifneq (,$(strip $(CUQUANTUM_ROOT))) CUSVFLAGS += -lcustatevec -lcublas CUSTATEVECFLAGS ?= $(CUSVFLAGS) TARGETS += qsim-custatevec + TARGETS += qsim-custatevecex TESTS += run-custatevec-tests + TESTS += run-custatevecex-tests + TESTS += run-custatevecex-mpi-tests else $(warning $$CUQUANTUM_ROOT is set, but the path does not seem to exist) endif @@ -140,6 +143,10 @@ cuda-tests: custatevec-tests: | check-cuquantum-root-set $(MAKE) -C tests/ custatevec-tests +.PHONY: custatevecex-tests +custatevecex-tests: | check-cuquantum-root-set + $(MAKE) -C tests/ custatevecex-tests + .PHONY: hip-tests hip-tests: $(MAKE) -C tests/ hip-tests @@ -156,6 +163,14 @@ run-cuda-tests: cuda-tests run-custatevec-tests: custatevec-tests $(MAKE) -C tests/ run-custatevec-tests +.PHONY: run-custatevecex-tests +run-custatevecex-tests: custatevecex-tests + $(MAKE) -C tests/ run-custatevecex-tests + +.PHONY: run-custatevecex-mpi-tests +run-custatevecex-mpi-tests: custatevecex-tests + $(MAKE) -C tests/ run-custatevecex-mpi-tests + .PHONY: run-hip-tests run-hip-tests: hip-tests $(MAKE) -C tests/ run-hip-tests diff --git a/apps/Makefile b/apps/Makefile index 48b25cabd..fd18635fc 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x) CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu") CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x) +CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevecex.cu") +CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevecex.x) + HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu') HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x) @@ -31,6 +34,9 @@ qsim-hip: $(HIP_TARGETS) %custatevec.x: %custatevec.cu $(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS) +%custatevecex.x: %custatevecex.cu + $(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS) + %hip.x: %cuda.cu $(HIPCC) -o ./$@ $< $(HIPCCFLAGS) diff --git a/apps/make.sh b/apps/make.sh index 610b2eb4f..7ebf00c7b 100755 --- a/apps/make.sh +++ b/apps/make.sh @@ -37,7 +37,8 @@ if command -v nvcc &>/dev/null; then ) nvcc -O3 "${CUSTATEVECFLAGS[@]}" \ -o qsim_base_custatevec.x qsim_base_custatevec.cu - + nvcc -O3 "${CUSTATEVECFLAGS[@]}" \ + -o qsim_base_custatevecex.x qsim_base_custatevecex.cu fi elif command -v hipcc &>/dev/null; then hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu diff --git a/apps/qsim_base_custatevecex.cu b/apps/qsim_base_custatevecex.cu new file mode 100644 index 000000000..8cc12fb80 --- /dev/null +++ b/apps/qsim_base_custatevecex.cu @@ -0,0 +1,161 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include "../lib/circuit_qsim_parser.h" +#include "../lib/formux.h" +#include "../lib/gates_qsim.h" +#include "../lib/io_file.h" +#include "../lib/multiprocess_custatevecex.h" +#include "../lib/run_custatevecex.h" +#include "../lib/simulator_custatevecex.h" +#include "../lib/util_custatevec.h" + +struct Options { + std::string circuit_file; + unsigned maxtime = std::numeric_limits::max(); + unsigned seed = 1; + unsigned verbosity = 0; +}; + +Options GetOptions(int argc, char* argv[]) { + constexpr char usage[] = "usage:\n ./qsim_base -c circuit -d maxtime " + "-s seed -v verbosity\n"; + + Options opt; + + int k; + + while ((k = getopt(argc, argv, "c:d:s:v:")) != -1) { + switch (k) { + case 'c': + opt.circuit_file = optarg; + break; + case 'd': + opt.maxtime = std::atoi(optarg); + break; + case 's': + opt.seed = std::atoi(optarg); + break; + break; + case 'v': + opt.verbosity = std::atoi(optarg); + break; + default: + qsim::IO::errorf(usage); + exit(1); + } + } + + return opt; +} + +bool ValidateOptions(const Options& opt) { + if (opt.circuit_file.empty()) { + qsim::IO::errorf("circuit file is not provided.\n"); + return false; + } + + return true; +} + +template +void PrintAmplitudes( + unsigned num_qubits, const StateSpace& state_space, const State& state) { + static constexpr char const* bits[8] = { + "000", "001", "010", "011", "100", "101", "110", "111", + }; + + uint64_t size = std::min(uint64_t{8}, uint64_t{1} << num_qubits); + unsigned s = 3 - std::min(unsigned{3}, num_qubits); + + for (uint64_t i = 0; i < size; ++i) { + auto a = state_space.GetAmpl(state, i); + qsim::IO::messagef("%s:%16.8g%16.8g%16.8g\n", + bits[i] + s, std::real(a), std::imag(a), std::norm(a)); + } +} + +int main(int argc, char* argv[]) { + using namespace qsim; + + auto opt = GetOptions(argc, argv); + if (!ValidateOptions(opt)) { + return 1; + } + + using fp_type = float; + + Circuit> circuit; + if (!CircuitQsimParser::FromFile(opt.maxtime, opt.circuit_file, + circuit)) { + return 1; + } + + struct Factory { + using Simulator = qsim::SimulatorCuStateVecEx; + using StateSpace = Simulator::StateSpace; + + explicit Factory(unsigned verbosity = 0) : verbosity(verbosity) { + mp.initialize(); + } + + StateSpace CreateStateSpace() const { + StateSpace::Parameter param; + param.verbosity = verbosity; + + return StateSpace{mp, param}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } + + MultiProcessCuStateVecEx mp; + unsigned verbosity; + }; + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Runner = CuStateVecExRunner; + + Factory factory(opt.verbosity); + + StateSpace state_space = factory.CreateStateSpace(); + State state = state_space.Create(circuit.num_qubits); + + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return 1; + } + + state_space.SetStateZero(state); + + Runner::Parameter param; + param.seed = opt.seed; + param.verbosity = opt.verbosity; + + if (Runner::Run(param, factory, circuit, state)) { + PrintAmplitudes(circuit.num_qubits, state_space, state); + } + + return 0; +} diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md index 593da1700..5dd1ddb4f 100644 --- a/docs/cirq_interface.md +++ b/docs/cirq_interface.md @@ -186,8 +186,11 @@ library. `QSimOptions` provides five parameters to configure GPU execution. `use_gpu` is required to enable GPU execution: * `use_gpu`: if True, use GPU instead of CPU for simulation. -* `gpu_mode`: use CUDA if set to 0 (default value) or use the NVIDIA cuStateVec -library if set to any other value. +* `gpu_mode`: use CUDA if set to 0 (default value), use the NVIDIA cuStateVec +if set to 1 or use the NVIDIA cuStateVecEx library if set to any other value. + +In the case of the NVIDIA cuStateVecEx library, simulations can be performed +in multi-device / multi-node environments. If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can optionally be set to fine-tune StateSpace performance for a specific device. @@ -196,3 +199,4 @@ In most cases, the default values provide good performance. StateSpace. This must be a power of 2 in the range [32, 1024]. * `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace. Below 16 data blocks, performance is noticeably reduced. + diff --git a/lib/BUILD b/lib/BUILD index 02aa71bb0..50ef51491 100644 --- a/lib/BUILD +++ b/lib/BUILD @@ -186,8 +186,10 @@ cuda_library( "matrix.h", "mps_simulator.h", "mps_statespace.h", + "multiprocess_custatevecex.h", "parfor.h", "qtrajectory.h", + "run_custatevecex.h", "run_qsim.h", "run_qsimh.h", "seqfor.h", @@ -198,12 +200,14 @@ cuda_library( "simulator_avx512.h", "simulator_basic.h", "simulator_custatevec.h", + "simulator_custatevecex.h", "simulator_sse.h", "statespace.h", "statespace_avx.h", "statespace_avx512.h", "statespace_basic.h", "statespace_custatevec.h", + "statespace_custatevecex.h", "statespace_sse.h", "umux.h", "unitary_calculator_avx.h", @@ -219,8 +223,10 @@ cuda_library( "util_cpu.h", "util_cuda.h", "util_custatevec.h", + "util_custatevecex.h", "vectorspace.h", "vectorspace_cuda.h", + "vectorspace_custatevecex.h", ], copts = ["-D__CUSTATEVEC__"], deps = [ @@ -357,6 +363,11 @@ cuda_library( hdrs = ["util_custatevec.h"], ) +cuda_library( + name = "util_custatevecex", + hdrs = ["util_custatevecex.h"], +) + ### Input/output libraries ### cc_library( @@ -506,6 +517,29 @@ cc_library( ], ) +cuda_library( + name = "run_custatevecex", + hdrs = ["run_custatevecex.h"], + deps = [ + ":circuit", + ":util", + ":util_custatevec", + ":util_custatevecex", + ], +) + +### Multi-process library ### + +cuda_library( + name = "multiprocess_custatevecex", + hdrs = ["multiprocess_custatevecex.h"], + deps = [ + ":io", + ":util_custatevec", + ":util_custatevecex", + ], +) + ### Vectorspace libraries ### cc_library( @@ -518,6 +552,19 @@ cuda_library( hdrs = ["vectorspace_cuda.h"], ) +cuda_library( + name = "vectorspace_custatevecex", + hdrs = ["vectorspace_custatevecex.h"], + deps = [ + "io", + ":multiprocess_custatevecex", + ":util_cuda", + ":util_custatevec", + ":util_custatevecex", + ":vectorspace_custatevecex", + ], +) + ### Statespace libraries ### cc_library( @@ -591,6 +638,20 @@ cuda_library( ], ) +cuda_library( + name = "statespace_custatevecex", + hdrs = [ + "statespace_custatevecex.h", + ], + deps = [ + ":multiprocess_custatevecex", + ":statespace", + ":util_custatevec", + ":util_custatevecex", + ":vectorspace_custatevecex", + ], +) + ### Simulator libraries ### cc_library( @@ -660,6 +721,19 @@ cuda_library( ], ) +cuda_library( + name = "simulator_custatevecex", + hdrs = [ + "simulator_custatevecex.h", + ], + deps = [ + ":io", + ":statespace_custatevecex", + ":util_custatevec", + ":util_custatevecex", + ], +) + # All three state-vector simulators with multiplexer cc_library( name = "simulator", diff --git a/lib/io.h b/lib/io.h index 3b26c7cc6..97de5fc12 100644 --- a/lib/io.h +++ b/lib/io.h @@ -20,11 +20,19 @@ namespace qsim { +namespace output { + static bool enabled = true; +} + /** * Controller for output logs. */ struct IO { static void errorf(const char* format, ...) { + if (!output::enabled) { + return; + } + va_list args; va_start(args, format); vfprintf(stderr, format, args); @@ -32,6 +40,10 @@ struct IO { } static void messagef(const char* format, ...) { + if (!output::enabled) { + return; + } + va_list args; va_start(args, format); vprintf(format, args); diff --git a/lib/io_file.h b/lib/io_file.h index 3cfac12db..789efbd60 100644 --- a/lib/io_file.h +++ b/lib/io_file.h @@ -47,6 +47,10 @@ struct IOFile : public IO { static bool WriteToFile( const std::string& file, const void* data, uint64_t size) { + if (!output::enabled) { + return true; + } + auto fs = std::fstream(file, std::ios::out | std::ios::binary); if (!fs) { diff --git a/lib/multiprocess_custatevecex.h b/lib/multiprocess_custatevecex.h new file mode 100644 index 000000000..001498aac --- /dev/null +++ b/lib/multiprocess_custatevecex.h @@ -0,0 +1,213 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MULTIPROCESS_CUSTATEVECEX_H_ +#define MULTIPROCESS_CUSTATEVECEX_H_ + +#include +#include + +#include +#include + +#include +#include + +#include "io.h" +#include "util_custatevec.h" +#include "util_custatevecex.h" + +namespace qsim { + +struct MultiProcessCuStateVecEx { + enum NetworkType { + kSuperPod = 0, + kGB200NVL = 1, + kSwitchTree = 2, + kCommunicator = 3, + }; + + struct Parameter { + uint64_t transfer_buffer_size = 16777216; + NetworkType network_type = kSuperPod; + }; + + MultiProcessCuStateVecEx(Parameter param = Parameter{16777216, kSuperPod}) + : param_(param), communicator_(nullptr), initialized_(false) {} + + ~MultiProcessCuStateVecEx() { + if (communicator_) { + custatevecExCommunicatorDestroy(communicator_); + } + + custatevecExCommunicatorStatus_t status; + custatevecExCommunicatorFinalize(&status); + } + + custatevecExCommunicatorDescriptor_t communicator() const { + return communicator_; + } + + unsigned num_processes() const { + return num_processes_; + } + + unsigned rank() const { + return rank_; + } + + bool initialized() const { + return initialized_; + } + + void initialize() { + int argc = 0; + char** argv = nullptr; + + auto comm_type = CUSTATEVEC_COMMUNICATOR_TYPE_OPENMPI; + + custatevecExCommunicatorStatus_t comm_status; + auto status = custatevecExCommunicatorInitialize( + comm_type, nullptr, &argc, &argv, &comm_status); + + if (status != CUSTATEVEC_STATUS_SUCCESS || + comm_status != CUSTATEVEC_EX_COMMUNICATOR_STATUS_SUCCESS) { + return; + } + + communicator_ = nullptr; + status = custatevecExCommunicatorCreate(&communicator_); + + if (status != CUSTATEVEC_STATUS_SUCCESS) { + return; + } + + int num_processes, rank; + ErrorCheck(communicator_->intf->getSize(communicator_, &num_processes)); + ErrorCheck(communicator_->intf->getRank(communicator_, &rank)); + + ErrorCheck(communicator_->intf->getRank(communicator_, &rank)); + if (rank != 0) { + output::enabled = false; + } + + if (num_processes < 2 || (num_processes & (num_processes - 1)) != 0) { + return; + } + + num_global_qubits_ = get_num_global_qubits(num_processes); + + unsigned num_acc_global_qubits = 0; + auto network_layers = get_network_layers(param_.network_type); + + num_global_qubits_per_layer_.reserve(2); + global_index_bit_classes_.reserve(2); + + for (const auto& layer : network_layers) { + auto k = num_global_qubits_ - num_acc_global_qubits; + global_index_bit_classes_.push_back(layer.global_index_bit_class); + + if (layer.num_global_qubits == 0 || k <= layer.num_global_qubits) { + num_global_qubits_per_layer_.push_back(k); + num_acc_global_qubits = num_global_qubits_; + break; + } + + num_global_qubits_per_layer_.push_back(layer.num_global_qubits); + num_acc_global_qubits += layer.num_global_qubits; + } + + if (num_acc_global_qubits < num_global_qubits_) { + IO::errorf("erorr: too few network layers at %s %d.\n", + __FILE__, __LINE__); + exit(1); + } + + memory_sharing_method_ = CUSTATEVEC_EX_MEMORY_SHARING_METHOD_NONE; + + for (const auto& layer : network_layers) { + if (layer.global_index_bit_class == + CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P) { + memory_sharing_method_ = CUSTATEVEC_EX_MEMORY_SHARING_METHOD_AUTODETECT; + break; + } + } + + num_processes_ = num_processes; + rank_ = rank; + initialized_ = true; + } + + auto create_sv_config(unsigned num_qubits, cudaDataType_t data_type) const { + custatevecExDictionaryDescriptor_t sv_config = nullptr; + + if (!initialized_ || + num_qubits < 3 || num_global_qubits_ + 2 > num_qubits) { + return sv_config; + } + + unsigned num_local_qubits = num_qubits - num_global_qubits_; + + ErrorCheck(custatevecExConfigureStateVectorMultiProcess( + &sv_config, data_type, num_qubits, num_local_qubits, -1, + memory_sharing_method_, global_index_bit_classes_.data(), + (int32_t*) num_global_qubits_per_layer_.data(), + (int32_t) global_index_bit_classes_.size(), + param_.transfer_buffer_size, nullptr, 0)); + + return sv_config; + } + + private: + Parameter param_; + custatevecExCommunicatorDescriptor_t communicator_; + std::vector num_global_qubits_per_layer_; + std::vector global_index_bit_classes_; + custatevecExMemorySharingMethod_t memory_sharing_method_; + unsigned num_processes_; + unsigned num_global_qubits_; + unsigned rank_; + bool initialized_; + + struct NetworkLayer { + custatevecExGlobalIndexBitClass_t global_index_bit_class; + unsigned num_global_qubits; + }; + + using NetworkLayers = std::vector; + + static NetworkLayers get_network_layers(NetworkType id) { + switch (id) { + case kSuperPod: + return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 3}, + {CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_COMMUNICATOR, 0}}; + case kGB200NVL: + return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 0}}; + break; + case kSwitchTree: + return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 2}, + {CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 1}}; + break; + case kCommunicator: + return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_COMMUNICATOR, 0}}; + break; + } + + return NetworkLayers{}; + } +}; + +} // namespace qsim + +#endif // MULTIPROCESS_CUSTATEVECEX_H_ diff --git a/lib/run_custatevecex.h b/lib/run_custatevecex.h new file mode 100644 index 000000000..390f11013 --- /dev/null +++ b/lib/run_custatevecex.h @@ -0,0 +1,312 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef RUN_CUSTATEVECEX_H_ +#define RUN_CUSTATEVECEX_H_ + +#include +#include +#include + +#include + +#include "circuit.h" +#include "util.h" +#include "util_custatevec.h" +#include "util_custatevecex.h" + +namespace qsim { + +/** + * Helper struct for running qsim with the cuStateVecEx library. + */ +template +struct CuStateVecExRunner final { + public: + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + using MeasurementResult = typename StateSpace::MeasurementResult; + + /** + * User-specified parameters for simulation. + */ + struct Parameter { + /** + * Random number generator seed to apply measurement gates. + */ + uint64_t seed; + + unsigned verbosity = 0; + }; + + /** + * Runs the given circuit, only measuring at the end. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, MeasurementFunc measure) { + return Run(param, factory, {circuit.gates.back().time}, circuit, measure); + } + + /** + * Runs the given circuit, measuring at user-specified times. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param times_to_measure_at Time steps at which to perform measurements. + * @param circuit The circuit to be simulated. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const std::vector& times_to_measure_at, + const Circuit& circuit, MeasurementFunc measure) { + std::vector discarded_results; + + StateSpace state_space = factory.CreateStateSpace(); + Simulator simulator = factory.CreateSimulator(); + + auto state = state_space.Create(circuit.num_qubits); + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return false; + } + + state_space.SetStateZero(state); + + return Run(param, circuit, state_space, simulator, state, + times_to_measure_at, measure, discarded_results); + } + + /** + * Runs the given circuit and make the final state available to the caller, + * recording the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param state As an input parameter, this should contain the initial state + * of the system. After a successful run, it will be populated with the + * final state of the system. + * @param measure_results As an input parameter, this should be empty. + * After a successful run, this will contain all measurements results from + * the run, ordered by time and qubit index. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, State& state, + std::vector& measure_results) { + auto measure = [](unsigned, const StateSpace&, const State&) {}; + + StateSpace state_space = factory.CreateStateSpace(); + Simulator simulator = factory.CreateSimulator(); + + return Run(param, circuit, state_space, simulator, state, + {}, measure, measure_results); + } + + /** + * Runs the given circuit and make the final state available to the caller, + * discarding the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param state As an input parameter, this should contain the initial state + * of the system. After a successful run, it will be populated with the + * final state of the system. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, State& state) { + auto measure = [](unsigned, const StateSpace&, const State&) {}; + + StateSpace state_space = factory.CreateStateSpace(); + Simulator simulator = factory.CreateSimulator(); + + std::vector discarded_results; + + return Run(param, circuit, state_space, simulator, state, + {}, measure, discarded_results); + } + + /** + * Runs the given circuit and make the final state available to the caller, + * recording the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param circuit The circuit to be simulated. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state As an input parameter, this should contain the initial state + * of the system. After a successful run, it will be populated with the + * final state of the system. + * @param measure_results As an input parameter, this should be empty. + * After a successful run, this will contain all measurements results from + * the run, ordered by time and qubit index. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Circuit& circuit, + const StateSpace& state_space, const Simulator& simulator, + State& state, + std::vector& measure_results) { + auto measure = [](unsigned, const StateSpace&, const State&) {}; + + return Run(param, circuit, state_space, simulator, state, + {}, measure, measure_results); + } + + /** + * Runs the given circuit and make the final state available to the caller, + * discarding the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param circuit The circuit to be simulated. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state As an input parameter, this should contain the initial state + * of the system. After a successful run, it will be populated with the + * final state of the system. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Circuit& circuit, + const StateSpace& state_space, const Simulator& simulator, + State& state) { + auto measure = [](unsigned, const StateSpace&, const State&) {}; + + std::vector discarded_results; + + return Run(param, circuit, state_space, simulator, state, + {}, measure, discarded_results); + } + + private: + template + static bool Run(const Parameter& param, const Circuit& circuit, + const StateSpace& state_space, const Simulator& simulator, + State& state, + const std::vector& times_to_measure_at, + MeasurementFunc measure, + std::vector& measure_results) { + double t0 = 0.0; + + RGen rgen(param.seed); + + custatevecExSVUpdaterDescriptor_t sv_updater = nullptr; + custatevecExDictionaryDescriptor_t sv_updater_config = nullptr; + + ErrorCheck(custatevecExConfigureSVUpdater( + &sv_updater_config, StateSpace::kStateDataType, nullptr, 0)); + + ErrorCheck( + custatevecExSVUpdaterCreate(&sv_updater, sv_updater_config, nullptr)); + ErrorCheck(custatevecExDictionaryDestroy(sv_updater_config)); + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + unsigned cur_time_index = 0; + + using Gates = detail::Gates; + const auto& gates = Gates::get(circuit); + + for (std::size_t i = 0; i < gates.size(); ++i) { + const auto& gate = Gates::gate(gates[i]); + unsigned num_qubits = gate.qubits.size(); + unsigned num_cqubits = gate.controlled_by.size(); + + if (gate.kind == gate::kMeasurement) { + ErrorCheck( + custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0)); + ErrorCheck(custatevecExSVUpdaterClear(sv_updater)); + + auto measure_result = state_space.Measure(gate.qubits, rgen, state); + if (measure_result.valid) { + measure_results.push_back(std::move(measure_result)); + } else { + IO::errorf("measurement failed.\n"); + return false; + } + } else if (num_cqubits == 0) { + if (num_qubits == 0) { + ErrorCheck( + custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0)); + ErrorCheck(custatevecExSVUpdaterClear(sv_updater)); + + simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); + } else { + ErrorCheck(custatevecExSVUpdaterEnqueueMatrix( + sv_updater, gate.matrix.data(), StateSpace::kMatrixDataType, + StateSpace::kExMatrixType, StateSpace::kMatrixLayout, 0, + (int32_t*) gate.qubits.data(), num_qubits, nullptr, nullptr, 0)); + } + } else { + std::vector control_bits; + control_bits.reserve(num_cqubits); + + for (std::size_t i = 0; i < num_cqubits; ++i) { + control_bits.push_back((gate.cmask >> i) & 1); + } + + ErrorCheck(custatevecExSVUpdaterEnqueueMatrix( + sv_updater, gate.matrix.data(), StateSpace::kMatrixDataType, + StateSpace::kExMatrixType, StateSpace::kMatrixLayout, 0, + (int32_t*) gate.qubits.data(), num_qubits, + (int32_t*) gate.controlled_by.data(), control_bits.data(), + num_cqubits)); + } + + if (times_to_measure_at.size() > 0) { + unsigned t = times_to_measure_at[cur_time_index]; + + if (i == gates.size() - 1 || t < Gates::gate(gates[i + 1]).time) { + ErrorCheck( + custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0)); + ErrorCheck(custatevecExSVUpdaterClear(sv_updater)); + + // Call back to perform measurements. + measure(cur_time_index, state_space, state); + ++cur_time_index; + } + } + } + + ErrorCheck(custatevecExSVUpdaterApply(sv_updater, state.get(), nullptr, 0)); + + if (param.verbosity > 0) { + state_space.DeviceSync(); + double t1 = GetTime(); + IO::messagef("simu time is %g seconds.\n", t1 - t0); + } + + ErrorCheck(custatevecExSVUpdaterDestroy(sv_updater)); + + return true; + } +}; + +} // namespace qsim + +#endif // RUN_CUSTATEVECEX_H_ diff --git a/lib/simulator_custatevecex.h b/lib/simulator_custatevecex.h new file mode 100644 index 000000000..cb4526ec8 --- /dev/null +++ b/lib/simulator_custatevecex.h @@ -0,0 +1,237 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUSTATEVECEX_H_ +#define SIMULATOR_CUSTATEVECEX_H_ + +#include +#include +#include + +#include +#include + +#include "io.h" +#include "statespace_custatevecex.h" +#include "util_custatevec.h" +#include "util_custatevecex.h" + +namespace qsim { + +/** + * Quantum circuit simulator using the NVIDIA cuStateVec library. + */ +template +class SimulatorCuStateVecEx final { + public: + using StateSpace = StateSpaceCuStateVecEx; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + static constexpr auto kStateDataType = StateSpace::kStateDataType; + static constexpr auto kMatrixDataType = StateSpace::kMatrixDataType; + static constexpr auto kExMatrixType = StateSpace::kExMatrixType; + static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout; + static constexpr auto kExpectDataType = CUDA_C_64F; + static constexpr auto kComputeType = + StateSpace::is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F; + + SimulatorCuStateVecEx() {} + + /** + * Applies a gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + if (qs.size() == 0) { + StateSpace::Multiply(matrix[0], matrix[1], state); + } else { + unsigned num_qubits = state.num_qubits(); + unsigned num_global_qubits = get_num_global_qubits(state.num_substates()); + unsigned num_local_qubits = num_qubits - num_global_qubits; + + if (qs.size() > num_local_qubits) { + IO::errorf("error: the number of gate qubits exceeds the number of " + "local qubits at %s %d.\n", __FILE__, __LINE__); + exit(1); + } + + ErrorCheck(custatevecExApplyMatrix( + state.get(), matrix, kMatrixDataType, kExMatrixType, kMatrixLayout, + 0, (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0)); + } + } + + /** + * Applies a controlled gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + if (qs.size() == 0) { + IO::errorf( + "error: controlled global phase gate is not implemented %s %d.\n", + __FILE__, __LINE__); + exit(1); + } else { + unsigned num_qubits = state.num_qubits(); + unsigned num_global_qubits = get_num_global_qubits(state.num_substates()); + unsigned num_local_qubits = num_qubits - num_global_qubits; + + if (qs.size() > num_local_qubits) { + IO::errorf("error: the number of gate qubits exceeds the number of " + "local qubits at %s %d.\n", __FILE__, __LINE__); + exit(1); + } + + std::vector control_bits; + control_bits.reserve(cqs.size()); + + for (std::size_t i = 0; i < cqs.size(); ++i) { + control_bits.push_back((cmask >> i) & 1); + } + + ErrorCheck(custatevecExApplyMatrix( + state.get(), matrix, kMatrixDataType, kExMatrixType, kMatrixLayout, + 0, (int32_t*) qs.data(), qs.size(), (int32_t*) cqs.data(), + control_bits.data(), cqs.size())); + } + } + + /** + * Computes the expectation value of an operator using the NVIDIA cuStateVec + * library. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); + unsigned num_global_qubits = get_num_global_qubits(state.num_substates()); + unsigned num_local_qubits = num_qubits - num_global_qubits; + + if (qs.size() > num_local_qubits) { + IO::errorf("error: the number of gate qubits exceeds the number of " + "local qubits at %s %d.\n", __FILE__, __LINE__); + exit(1); + } + + const auto& wire_ordering = state.get_wire_ordering(); + + // Wire ordering can be arbitrary. The following lines make qs consistent + // with wire ordering and permute bits if necessary. + + std::vector perm; + perm.reserve(num_qubits); + + for (unsigned i = 0; i < num_qubits; ++i) { + perm.push_back(i); + } + + unsigned l = 0; + std::vector qs2(qs.size()); + + for (unsigned k = 0; k < qs.size(); ++k) { + for (unsigned i = 0; i < num_qubits; ++i) { + if (qs[k] == (unsigned) wire_ordering[i]) { + qs2[k] = i; + break; + } + } + } + + for (unsigned k = 0; k < qs2.size(); ++k) { + if (qs2[k] >= num_local_qubits) { + unsigned j = 0; + while (j < qs2.size()) { + for (j = 0; j < qs2.size(); ++j) { + if (qs2[j] == l) { + ++l; + + if (l == num_local_qubits) { + // We should not get here. + IO::errorf("error: internal error at %s %d.\n", + __FILE__, __LINE__); + exit(1); + } + + break; + } + } + } + + std::swap(perm[qs2[k]], perm[l]); + qs2[k] = l++; + } + } + + if (l > 0) { + ErrorCheck(custatevecExStateVectorPermuteIndexBits( + state.get(), (int32_t*) perm.data(), num_qubits, + CUSTATEVEC_EX_PERMUTATION_SCATTER)); + } + + auto f = [&matrix, &state, &num_local_qubits, &qs2]( + unsigned i, const auto& r) { + void* workspace; + size_t workspace_size; + + ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( + r.custatevec_handle, kStateDataType, num_local_qubits, matrix, + kMatrixDataType, kMatrixLayout, qs2.size(), kComputeType, + &workspace_size)); + + // TODO: reuse allocated memory. + ErrorCheck(cudaMalloc(&workspace, workspace_size)); + + cuDoubleComplex eval; + + ErrorCheck(custatevecComputeExpectation( + r.custatevec_handle, r.device_ptr, kStateDataType, num_local_qubits, + &eval, kExpectDataType, nullptr, matrix, kMatrixDataType, + kMatrixLayout, (int32_t*) qs2.data(), qs2.size(), kComputeType, + workspace, workspace_size)); + + ErrorCheck(cudaFree(workspace)); + + return std::complex{cuCreal(eval), cuCimag(eval)}; + }; + + return state.reduce(f); + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 32; + } + + private: +}; + +} // namespace qsim + +#endif // SIMULATOR_CUSTATEVECEX_H_ diff --git a/lib/statespace_custatevecex.h b/lib/statespace_custatevecex.h new file mode 100644 index 000000000..8b27822c7 --- /dev/null +++ b/lib/statespace_custatevecex.h @@ -0,0 +1,431 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_CUSTATEVECEX_H_ +#define STATESPACE_CUSTATEVECEX_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "multiprocess_custatevecex.h" +#include "statespace.h" +#include "util_custatevec.h" +#include "util_custatevecex.h" +#include "vectorspace_custatevecex.h" + +namespace qsim { + +namespace detail { + +template +__global__ void SetStateKernel(FP v, uint64_t size, void* state) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + + if (k < size) { + ((FP*) state)[2 * k] = v; + ((FP*) state)[2 * k + 1] = 0; + } +} + +} // namespace detail + +/** + * Object containing context and routines for cuStateVec state-vector + * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`. + */ +template +class StateSpaceCuStateVecEx : + public StateSpace, VectorSpaceCuStateVecEx, FP> { + private: + using Base = + StateSpace, VectorSpaceCuStateVecEx, FP>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + using Parameter = typename Base::Parameter; + + static constexpr auto kStateDataType = Base::kStateDataType; + static constexpr auto kMatrixDataType = kStateDataType; + static constexpr auto kExMatrixType = CUSTATEVEC_EX_MATRIX_DENSE; + static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW; + + explicit StateSpaceCuStateVecEx(const MultiProcessCuStateVecEx& mp, + Parameter param = Parameter{}) + : Base(param, mp) {} + + static uint64_t MinSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + void InternalToNormalOrder(State& state) const { + state.to_normal_order(); + } + + void NormalToInternalOrder(State& state) const { + } + + void SetAllZeros(State& state) const { + uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates(); + + auto f = [&size](unsigned i, const auto& r) { + unsigned threads = size < 256 ? size : 256; + unsigned blocks = size / threads; + fp_type zero = 0.0; + detail::SetStateKernel<<>>(zero, size, r.device_ptr); + }; + + state.assign(f); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + fp_type v = double{1} / std::sqrt(size); + size /= state.num_substates(); + + auto f = [&size, &v](unsigned i, const auto& r) { + unsigned threads = size < 256 ? size : 256; + unsigned blocks = size / threads; + detail::SetStateKernel<<>>(v, size, r.device_ptr); + }; + + state.assign(f); + } + + // |0> state. + void SetStateZero(State& state) const { + ErrorCheck((custatevecExStateVectorSetZeroState(state.get()))); + } + + // It is not recommended to use this function. + std::complex GetAmpl(const State& state, uint64_t i) const { + fp_type buf[2] = {0, 0}; + + uint64_t k = 0; + const auto& wire_ordering = state.get_wire_ordering(); + for (unsigned j = 0; j < state.num_qubits(); ++j) { + k |= ((i >> wire_ordering[j]) & 1) << j; + } + + uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates(); + unsigned required_rank = k / size; + + if (state.distr_type() != Base::kMultiProcess + || Base::mp.rank() == required_rank) { + ErrorCheck(custatevecExStateVectorGetState( + state.get(), buf, kStateDataType, k, k + 1, 1)); + } + + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); + + if (state.distr_type() == Base::kMultiProcess) { + auto cuda_type = GetCudaType>(); + auto comm = Base::mp.communicator(); + ErrorCheck(comm->intf->bcast(comm, buf, 1, cuda_type, required_rank)); + } + + return {buf[0], buf[1]}; + } + + // It is not recommended to use this function. + void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) const { + fp_type buf[2] = {std::real(ampl), std::imag(ampl)}; + + uint64_t k = 0; + const auto& wire_ordering = state.get_wire_ordering(); + for (unsigned j = 0; j < state.num_qubits(); ++j) { + k |= ((i >> wire_ordering[j]) & 1) << j; + } + + uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates(); + unsigned required_rank = k / size; + + if (state.distr_type() != Base::kMultiProcess + || Base::mp.rank() == required_rank) { + ErrorCheck(custatevecExStateVectorSetState( + state.get(), buf, kStateDataType, k, k + 1, 1)); + } + + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); + } + + // It is not recommended to use this function. + void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) const { + fp_type buf[2] = {re, im}; + + uint64_t k = 0; + const auto& wire_ordering = state.get_wire_ordering(); + for (unsigned j = 0; j < state.num_qubits(); ++j) { + k |= ((i >> wire_ordering[j]) & 1) << j; + } + + uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates(); + unsigned required_rank = k / size; + + if (state.distr_type() != Base::kMultiProcess + || Base::mp.rank() == required_rank) { + ErrorCheck(custatevecExStateVectorSetState( + state.get(), buf, kStateDataType, k, k + 1, 1)); + } + + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + static void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) { + // Not implemented. + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + static void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) { + // Not implemented. + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates(); + + auto f = [&size](unsigned i, const auto& rd, const auto& rs) { + cublasHandle_t cublas_handle; + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(cublasSetStream(cublas_handle, rd.stream)); + + if (Base::is_float) { + cuComplex a = {1.0, 0.0}; + auto p1 = (const cuComplex*) rs.device_ptr; + auto p2 = (cuComplex*) rd.device_ptr; + ErrorCheck(cublasCaxpy(cublas_handle, size, &a, p1, 1, p2, 1)); + } else { + cuDoubleComplex a = {1.0, 0.0}; + auto p1 = (const cuDoubleComplex*) rs.device_ptr; + auto p2 = (cuDoubleComplex*) rd.device_ptr; + ErrorCheck(cublasZaxpy(cublas_handle, size, &a, p1, 1, p2, 1)); + } + + ErrorCheck(cudaStreamSynchronize(rd.stream)); + ErrorCheck(cublasDestroy(cublas_handle)); + }; + + dest.assign(src, f); + + return true; + } + + // Does the equivalent of state *= a elementwise. + static void Multiply(fp_type a, State& state) { + uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates(); + + auto f = [&a, &size](unsigned i, const auto& r) { + cublasHandle_t cublas_handle; + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(cublasSetStream(cublas_handle, r.stream)); + + if (Base::is_float) { + float a1 = a; + auto p = (cuComplex*) r.device_ptr; + ErrorCheck(cublasCsscal(cublas_handle, size, &a1, p, 1)); + } else { + double a1 = a; + auto p = (cuDoubleComplex*) r.device_ptr; + ErrorCheck(cublasZdscal(cublas_handle, size, &a1, p, 1)); + } + + ErrorCheck(cudaStreamSynchronize(r.stream)); + ErrorCheck(cublasDestroy(cublas_handle)); + }; + + return state.assign(f); + } + + // Does the equivalent of state *= (re + i im) elementwise. + static void Multiply(fp_type re, fp_type im, State& state) { + uint64_t size = (uint64_t{1} << state.num_qubits()) / state.num_substates(); + + auto f = [&re, &im, &size](unsigned i, const auto& r) { + cublasHandle_t cublas_handle; + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(cublasSetStream(cublas_handle, r.stream)); + + if (Base::is_float) { + cuComplex a = {float(re), float(im)}; + auto p = (cuComplex*) r.device_ptr; + ErrorCheck(cublasCscal(cublas_handle, size, &a, p, 1)); + } else { + cuDoubleComplex a = {re, im}; + auto p = (cuDoubleComplex*) r.device_ptr; + ErrorCheck(cublasZscal(cublas_handle, size, &a, p, 1)); + } + + ErrorCheck(cudaStreamSynchronize(r.stream)); + ErrorCheck(cublasDestroy(cublas_handle)); + }; + + return state.assign(f); + } + + static std::complex InnerProduct( + const State& state1, const State& state2) { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + uint64_t size = + (uint64_t{1} << state1.num_qubits()) / state1.num_substates(); + + auto f = [&size](unsigned i, const auto& r1, const auto& r2) { + cublasHandle_t cublas_handle; + ErrorCheck(cublasCreate(&cublas_handle)); + ErrorCheck(cublasSetStream(cublas_handle, r1.stream)); + + if (Base::is_float) { + cuComplex result; + auto p1 = (const cuComplex*) r1.device_ptr; + auto p2 = (const cuComplex*) r2.device_ptr; + ErrorCheck(cublasCdotc(cublas_handle, size, p1, 1, p2, 1, &result)); + return std::complex{cuCrealf(result), cuCimagf(result)}; + } else { + cuDoubleComplex result; + auto p1 = (const cuDoubleComplex*) r1.device_ptr; + auto p2 = (const cuDoubleComplex*) r2.device_ptr; + ErrorCheck(cublasZdotc(cublas_handle, size, p1, 1, p2, 1, &result)); + return std::complex{cuCreal(result), cuCimag(result)}; + } + + ErrorCheck(cudaStreamSynchronize(r1.stream)); + ErrorCheck(cublasDestroy(cublas_handle)); + }; + + return state1.reduce(state2, f); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + return std::real(InnerProduct(state1, state2)); + } + + double Norm(const State& state) const { + double norm; + + ErrorCheck(custatevecExAbs2SumArray( + state.get(), &norm, nullptr, 0, nullptr, nullptr, 0)); + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); + + return norm; + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + auto rs = GenerateRandomValues(num_samples, seed, 1.0); + + std::vector bitstrings0(num_samples); + + std::vector wires; + wires.reserve(state.num_qubits()); + for (unsigned i = 0; i < state.num_qubits(); ++i) { + wires[i] = i; + } + + ErrorCheck(custatevecExSample( + state.get(), bitstrings0.data(), wires.data(), state.num_qubits(), + rs.data(), num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER, + nullptr)); + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); + + bitstrings.reserve(num_samples); + for (unsigned i = 0; i < num_samples; ++i) { + bitstrings.push_back(bitstrings0[i]); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + template + MeasurementResult Measure(const std::vector& qubits, + RGen& rgen, State& state, + bool no_collapse = false) const { + auto r = RandomValue(rgen, 1.0); + + MeasurementResult result; + + result.valid = true; + result.mask = 0; + result.bits = 0; + result.bitstring.resize(qubits.size(), 0); + + for (auto q : qubits) { + if (q >= state.num_qubits()) { + result.valid = false; + return result; + } + + result.mask |= uint64_t{1} << q; + } + + auto collapse = no_collapse ? + CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO; + + custatevecIndex_t bits; + + ErrorCheck(custatevecExMeasure( + state.get(), &bits, (int32_t*) qubits.data(), qubits.size(), + r, collapse, nullptr)); + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); + + for (std::size_t i = 0; i < qubits.size(); ++i) { + uint64_t bit = (bits >> i) & 1; + result.bitstring[i] = bit; + result.bits |= bit << qubits[i]; + } + + return result; + } + + template + MeasurementResult VirtualMeasure(const std::vector& qubits, + RGen& rgen, const State& state) const { + return Measure(qubits, rgen, const_cast(state), true); + } + + void Collapse(const MeasurementResult& mr, State& state) const { + // Not implemented. + } +}; + +} // namespace qsim + +#endif // STATESPACE_CUSTATEVECEX_H_ diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 5d8cb5df3..b34292753 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -22,6 +22,7 @@ #endif #include +#include #include "io.h" @@ -31,11 +32,27 @@ namespace qsim { inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) { if (code != cudaSuccess) { - IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); + IO::errorf( + "CUDA error: %s at %s %d\n", cudaGetErrorString(code), file, line); exit(code); } } +template +inline auto GetCudaType() { + if (std::is_same_v) { + return CUDA_R_32F; + } else if (std::is_same_v) { + return CUDA_R_64F; + } else if (std::is_same_v>) { + return CUDA_C_32F; + } else if (std::is_same_v>) { + return CUDA_C_64F; + } + + return CUDA_C_64F; +} + template struct Complex { __host__ __device__ __forceinline__ Complex() {} diff --git a/lib/util_custatevec.h b/lib/util_custatevec.h index 36f29efab..d37858b40 100644 --- a/lib/util_custatevec.h +++ b/lib/util_custatevec.h @@ -25,7 +25,7 @@ namespace qsim { inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { if (code != CUBLAS_STATUS_SUCCESS) { - IO::errorf("cuBLAS error %i: %s %d\n", code, file, line); + IO::errorf("cuBLAS error %d at %s %d\n", code, file, line); exit(code); } } @@ -33,7 +33,7 @@ inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { inline void ErrorAssert( custatevecStatus_t code, const char* file, unsigned line) { if (code != CUSTATEVEC_STATUS_SUCCESS) { - IO::errorf("custatevec error: %s %s %d\n", + IO::errorf("cuStateVec error: %s at %s %d\n", custatevecGetErrorString(code), file, line); exit(code); } diff --git a/lib/util_custatevecex.h b/lib/util_custatevecex.h new file mode 100644 index 000000000..ab57d7e85 --- /dev/null +++ b/lib/util_custatevecex.h @@ -0,0 +1,46 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CUSTATEVECEX_H_ +#define UTIL_CUSTATEVECEX_H_ + +#include +#include + +#include "io.h" +#include "util_cuda.h" + +namespace qsim { + +inline void ErrorAssert( + custatevecExCommunicatorStatus_t code, const char* file, unsigned line) { + if (code != CUSTATEVEC_EX_COMMUNICATOR_STATUS_SUCCESS) { + IO::errorf( + "cuStateVecEx communicator error %d at %s %d\n", code, file, line); + exit(code); + } +} + +inline unsigned get_num_global_qubits(unsigned num_devices) { + unsigned num_global_qubits = 0; + while ((num_devices >>= 1) > 0) { + ++num_global_qubits; + } + + return num_global_qubits; +} + +} // namespace qsim + +#endif // UTIL_CUSTATEVECEX_H_ diff --git a/lib/vectorspace_custatevecex.h b/lib/vectorspace_custatevecex.h new file mode 100644 index 000000000..ed72c403b --- /dev/null +++ b/lib/vectorspace_custatevecex.h @@ -0,0 +1,611 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VECTORSPACE_CUSTATEVECEX_H_ +#define VECTORSPACE_CUSTATEVECEX_H_ + +#include +#include + +#include + +#include +#include +#include +#include + +#include "io.h" +#include "multiprocess_custatevecex.h" +#include "util_cuda.h" +#include "util_custatevec.h" +#include "util_custatevecex.h" + +namespace qsim { + +namespace detail { + +inline void free(void* ptr) {} + +} // namespace detail + +// Routines for vector manipulations. +template +class VectorSpaceCuStateVecEx { + public: + using fp_type = FP; + + static constexpr auto is_float = std::is_same::value; + static constexpr auto kStateDataType = is_float ? CUDA_C_32F : CUDA_C_64F; + + enum DistributionType { + kNoDistr, + kSingleDevice, + kMultiDevice, + kMultiProcess, + }; + + enum DeviceNetworkType { + kSwitch = 0, + kFullMesh = 1, + }; + + struct Parameter { + unsigned num_devices = 0; + DeviceNetworkType device_network_type = kSwitch; + unsigned verbosity = 0; + }; + + class Vector { + public: + struct CuStateVecResources { + int32_t device_id = -1; + void* device_ptr = nullptr; + cudaStream_t stream = nullptr; + custatevecHandle_t custatevec_handle = nullptr; + }; + + Vector(const Vector&) = delete; + Vector& operator=(const Vector&) = delete; + + Vector() : mp_(nullptr), ptr_(nullptr), + num_qubits_(0), num_substates_(0), distr_type_(kNoDistr) {} + + Vector(const MultiProcessCuStateVecEx* mp, + custatevecExStateVectorDescriptor_t ptr, unsigned num_qubits, + unsigned num_substates, DistributionType distr_type) + : mp_(mp), ptr_(ptr), wire_ordering_(num_qubits), + num_qubits_(num_qubits), num_substates_(num_substates), + distr_type_(distr_type) {} + + Vector(Vector&& r) : mp_(r.mp_), ptr_(r.ptr_), + wire_ordering_(std::move(r.wire_ordering_)), + num_qubits_(r.num_qubits_), num_substates_(r.num_substates_), + distr_type_(r.distr_type_) { + r.mp_ = nullptr; + r.ptr_ = nullptr; + r.num_qubits_ = 0; + r.num_substates_ = 0; + r.distr_type_ = kNoDistr; + } + + ~Vector() { + if (ptr_ != nullptr) { + ErrorCheck(custatevecExStateVectorDestroy(ptr_)); + } + } + + Vector& operator=(Vector&& r) { + if (this != &r) { + mp_ = r.mp_; + ptr_ = r.ptr_; + wire_ordering_ = std::move(r.wire_ordering_); + num_qubits_ = r.num_qubits_; + num_substates_ = r.num_substates_; + distr_type_ = r.distr_type_; + + r.mp_ = nullptr; + r.ptr_ = nullptr; + r.num_qubits_ = 0; + r.num_substates_ = 0; + r.distr_type_ = kNoDistr; + } + + return *this; + } + + auto get() { + return ptr_; + } + + const auto get() const { + return ptr_; + } + + custatevecExStateVectorDescriptor_t release() { + auto ptr = ptr_; + + mp_ = nullptr; + ptr_ = nullptr; + num_qubits_ = 0; + num_substates_ = 0; + distr_type_ = kNoDistr; + + return ptr; + } + + unsigned num_qubits() const { + return num_qubits_; + } + + unsigned num_substates() const { + return num_substates_; + } + + DistributionType distr_type() const { + return distr_type_; + } + + static constexpr bool requires_copy_to_host() { + return true; + } + + const auto& get_wire_ordering() const { + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_WIRE_ORDERING, + const_cast(wire_ordering_.data()), + sizeof(int32_t) * num_qubits_)); + + return wire_ordering_; + } + + void to_normal_order() const { + const auto& wire_ordering = get_wire_ordering(); + + ErrorCheck(custatevecExStateVectorPermuteIndexBits( + ptr_, wire_ordering.data(), num_qubits_, + CUSTATEVEC_EX_PERMUTATION_SCATTER)); + } + + CuStateVecResources get_resources(unsigned substate_index) const { + CuStateVecResources r; + + ErrorCheck(custatevecExStateVectorGetResourcesFromDeviceSubSV( + ptr_, substate_index, &r.device_id, &r.device_ptr, &r.stream, + &r.custatevec_handle)); + + return r; + } + + template + void assign(Callback&& callback) const { + if (distr_type_ == kMultiProcess) { + unsigned num_devices = 1; + std::vector substate_indices(num_devices); + + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_devices * sizeof(int32_t))); + + unsigned k = substate_indices[0]; + auto res = get_resources(k); + + ErrorCheck(cudaSetDevice(res.device_id)); + + callback(k, res); + } else { + if (num_substates_ == 1) { + callback(0, get_resources(0)); + } else { + std::vector substate_indices(num_substates_); + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_substates_ * sizeof(int32_t))); + + for (unsigned i = 0; i < num_substates_; ++i) { + unsigned k = substate_indices[i]; + auto res = get_resources(k); + + ErrorCheck(cudaSetDevice(res.device_id)); + + callback(k, res); + } + } + } + } + + template + auto reduce(Callback&& callback) const { + using ResultType = std::invoke_result_t; + + if (distr_type_ == kMultiProcess) { + unsigned num_devices = 1; + std::vector substate_indices(num_devices); + + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_devices * sizeof(int32_t))); + + unsigned k = substate_indices[0]; + auto res = get_resources(k); + + ErrorCheck(cudaSetDevice(res.device_id)); + + ResultType r; + ResultType local_r = callback(k, res); + + auto cuda_type = GetCudaType(); + auto comm = mp_->communicator(); + ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type)); + + return r; + } else { + if (num_substates_ == 1) { + return callback(0, get_resources(0)); + } else { + std::vector substate_indices(num_substates_); + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_substates_ * sizeof(int32_t))); + + ResultType r = 0; + + for (unsigned i = 0; i < num_substates_; ++i) { + unsigned k = substate_indices[i]; + auto res = get_resources(k); + + ErrorCheck(cudaSetDevice(res.device_id)); + + r += callback(k, res); + } + + return r; + } + } + } + + template + void assign(const Vector& vec, Callback&& callback) const { + if (distr_type_ == kMultiProcess) { + unsigned num_devices = 1; + std::vector substate_indices(num_devices); + + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_devices * sizeof(int32_t))); + + unsigned k = substate_indices[0]; + auto res1 = get_resources(k); + auto res2 = vec.get_resources(k); + + ErrorCheck(cudaSetDevice(res1.device_id)); + + callback(k, res1, res2); + } else { + if (num_substates_ == 1) { + callback(0, get_resources(0), vec.get_resources(0)); + } else { + std::vector substate_indices(num_substates_); + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_substates_ * sizeof(int32_t))); + + for (unsigned i = 0; i < num_substates_; ++i) { + unsigned k = substate_indices[i]; + auto res1 = get_resources(k); + auto res2 = vec.get_resources(k); + + ErrorCheck(cudaSetDevice(res1.device_id)); + + callback(k, res1, res2); + } + } + } + } + + template + auto reduce(const Vector& vec, Callback&& callback) const { + using ResultType = std::invoke_result_t; + + if (distr_type_ == kMultiProcess) { + unsigned num_devices = 1; + std::vector substate_indices(num_devices); + + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_devices * sizeof(int32_t))); + + unsigned k = substate_indices[0]; + auto res1 = get_resources(k); + auto res2 = vec.get_resources(k); + + ErrorCheck(cudaSetDevice(res2.device_id)); + ErrorCheck(cudaStreamSynchronize(res2.stream)); + + ResultType r; + ResultType local_r = callback(k, res1, res2); + + auto cuda_type = GetCudaType(); + auto comm = mp_->communicator(); + ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type)); + + return r; + } else { + if (num_substates_ == 1) { + return callback(0, get_resources(0), vec.get_resources(0)); + } else { + std::vector substate_indices(num_substates_); + ErrorCheck(custatevecExStateVectorGetProperty( + ptr_, CUSTATEVEC_EX_SV_PROP_DEVICE_SUBSV_INDICES, + substate_indices.data(), num_substates_ * sizeof(int32_t))); + + ResultType r = 0; + + for (unsigned i = 0; i < num_substates_; ++i) { + unsigned k = substate_indices[i]; + auto res1 = get_resources(k); + auto res2 = vec.get_resources(k); + + ErrorCheck(cudaSetDevice(res2.device_id)); + ErrorCheck(cudaStreamSynchronize(res2.stream)); + + r += callback(k, res1, res2); + } + + return r; + } + } + } + + private: + const MultiProcessCuStateVecEx* mp_; + custatevecExStateVectorDescriptor_t ptr_; + std::vector wire_ordering_; + unsigned num_qubits_; + unsigned num_substates_; + DistributionType distr_type_; + }; + + VectorSpaceCuStateVecEx(const Parameter& param, + const MultiProcessCuStateVecEx& mp) + : param(param), mp(mp) {} + + Vector Create(unsigned num_qubits) const { + custatevecExStateVectorDescriptor_t state_vec; + custatevecExDictionaryDescriptor_t sv_config + = mp.create_sv_config(num_qubits, kStateDataType); + + unsigned num_substates = 1; + DistributionType distr_type = kNoDistr; + + if (sv_config != nullptr) { + ErrorCheck(custatevecExStateVectorCreateMultiProcess( + &state_vec, sv_config, nullptr, mp.communicator(), nullptr)); + + num_substates = mp.num_processes(); + distr_type = kMultiProcess; + + if (param.verbosity > 2) { + unsigned num_global_qubits = get_num_global_qubits(num_substates); + IO::messagef("multi-process mode: %u %u.\n", + num_qubits, num_global_qubits); + } + } else { + num_substates = param.num_devices; + + if (num_qubits < 3) { + num_substates = 1; + } else if (num_substates == 0) { + int count = 0; + ErrorCheck(cudaGetDeviceCount(&count)); + num_substates = count; + } + + if (num_substates == 1) { + ErrorCheck(custatevecExConfigureStateVectorSingleDevice( + &sv_config, kStateDataType, num_qubits, num_qubits, -1, 0)); + + distr_type = kSingleDevice; + + if (param.verbosity > 2) { + IO::messagef("single device mode.\n"); + } + } else { + unsigned num_global_qubits = get_num_global_qubits(num_substates); + + while (num_global_qubits + 2 > num_qubits && num_substates > 1) { + num_substates /= 2; + --num_global_qubits; + } + + if (num_substates == 1) { + ErrorCheck(custatevecExConfigureStateVectorSingleDevice( + &sv_config, kStateDataType, num_qubits, num_qubits, -1, 0)); + + distr_type = kSingleDevice; + + if (param.verbosity > 2) { + IO::messagef("single-device mode (too few qubits).\n"); + } + } else { + std::vector device_ids(num_substates); + for (unsigned i = 0; i < num_substates; ++i) { + device_ids[i] = i; + } + + unsigned num_local_qubits = num_qubits - num_global_qubits; + + auto device_network_type = + get_device_network_type(param.device_network_type); + + ErrorCheck(custatevecExConfigureStateVectorMultiDevice( + &sv_config, kStateDataType, num_qubits, num_local_qubits, + device_ids.data(), num_substates, device_network_type, 0)); + + distr_type = kMultiDevice; + + if (param.verbosity > 2) { + IO::messagef("multi-device mode: %u %u.\n", + num_qubits, num_global_qubits); + } + } + } + + ErrorCheck(custatevecExStateVectorCreateSingleProcess( + &state_vec, sv_config, nullptr, 0, nullptr)); + } + + ErrorCheck(custatevecExDictionaryDestroy(sv_config)); + + return Vector{&mp, state_vec, num_qubits, num_substates, distr_type}; + } + + static Vector Null() { + return Vector{nullptr, nullptr, 0, 0, kNoDistr}; + } + + static bool IsNull(const Vector& vector) { + return vector.get() == nullptr; + } + + bool Copy(const Vector& src, Vector& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates(); + + auto f = [&size](unsigned i, const auto& rd, const auto& rs) { + ErrorCheck(cudaMemcpy( + rd.device_ptr, rs.device_ptr, 2 * sizeof(fp_type) * size, + cudaMemcpyDeviceToDevice)); + }; + + dest.assign(src, f); + + return true; + } + + // It is the client's responsibility to make sure that dest has at least + // 2^src.num_qubits() elements. + bool Copy(const Vector& src, fp_type* dest) const { + if (src.distr_type() == kMultiProcess) { + uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates(); + uint64_t offset = size * mp.rank(); + + ErrorCheck(custatevecExStateVectorGetState( + src.get(), dest + 2 * offset, kStateDataType, + offset, offset + size, 1)); + ErrorCheck(custatevecExStateVectorSynchronize(src.get())); + + auto cuda_type = GetCudaType>(); + auto comm = mp.communicator(); + ErrorCheck(comm->intf->allgather( + comm, dest + 2 * offset, dest, size, cuda_type)); + } else { + uint64_t size = uint64_t{1} << src.num_qubits(); + ErrorCheck(custatevecExStateVectorGetState( + src.get(), dest, kStateDataType, 0, size, 1)); + ErrorCheck(custatevecExStateVectorSynchronize(src.get())); + } + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // 2^dest.num_qubits() elements. + bool Copy(const fp_type* src, Vector& dest) const { + if (dest.distr_type() == kMultiProcess) { + uint64_t size = (uint64_t{1} << dest.num_qubits()) / dest.num_substates(); + uint64_t offset = size * mp.rank(); + + ErrorCheck(custatevecExStateVectorSetState( + dest.get(), src + 2 * offset, kStateDataType, + offset, offset + size, 1)); + } else { + uint64_t size = uint64_t{1} << dest.num_qubits(); + ErrorCheck(custatevecExStateVectorSetState( + dest.get(), src, kStateDataType, 0, size, 1)); + } + + ErrorCheck(custatevecExStateVectorSynchronize(dest.get())); + + // TODO: do we need that? + dest.to_normal_order(); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // 2^dest.num_qubits() elements. + bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { + size = size / 2; + + if (size != (uint64_t{1} << dest.num_qubits())) { + IO::errorf("wrong size in VectorSpaceCuStateVecEx::Copy.\n"); + return false; + } + + if (dest.distr_type() == kMultiProcess) { + size /= dest.num_substates(); + uint64_t offset = size * mp.rank(); + + ErrorCheck(custatevecExStateVectorSetState( + dest.get(), src + 2 * offset, kStateDataType, + offset, offset + size, 1)); + } else { + ErrorCheck(custatevecExStateVectorSetState( + dest.get(), src, kStateDataType, 0, size, 1)); + } + + ErrorCheck(custatevecExStateVectorSynchronize(dest.get())); + + // TODO: do we need that? + dest.to_normal_order(); + + return true; + } + + static void DeviceSync() { + ErrorCheck(cudaDeviceSynchronize()); + } + + protected: + Parameter param; + const MultiProcessCuStateVecEx& mp; + + private: + static custatevecDeviceNetworkType_t get_device_network_type( + DeviceNetworkType id) { + custatevecDeviceNetworkType_t device_network_type = + CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH; + + switch (id) { + case kSwitch: + device_network_type = CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH; + break; + case kFullMesh: + device_network_type = CUSTATEVEC_DEVICE_NETWORK_TYPE_FULLMESH; + break; + } + + return device_network_type; + } +}; + +} // namespace qsim + +#endif // VECTORSPACE_CUSTATEVECEX_H_ diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile index 9bcc54b63..f9693c82c 100644 --- a/pybind_interface/Makefile +++ b/pybind_interface/Makefile @@ -21,6 +21,7 @@ QSIMLIB_AVX2 = ../qsimcirq/qsim_avx2$(SUFFIX) QSIMLIB_AVX512 = ../qsimcirq/qsim_avx512$(SUFFIX) QSIMLIB_CUDA = ../qsimcirq/qsim_cuda$(SUFFIX) QSIMLIB_CUSTATEVEC = ../qsimcirq/qsim_custatevec$(SUFFIX) +QSIMLIB_CUSTATEVECEX = ../qsimcirq/qsim_custatevecex$(SUFFIX) QSIMLIB_HIP = ../qsimcirq/qsim_hip$(SUFFIX) QSIMLIB_DECIDE = ../qsimcirq/qsim_decide$(SUFFIX) @@ -66,7 +67,7 @@ else ifeq ($(CUQUANTUM_ROOT),) pybind: pybind-cpu pybind-cuda decide-cuda else -pybind: pybind-cpu pybind-cuda pybind-custatevec decide-custatevec +pybind: pybind-cpu pybind-cuda pybind-custatevec pybind-custatevecex decide-custatevec endif endif @@ -94,9 +95,13 @@ decide-cuda: pybind-custatevec: $(NVCC) custatevec/pybind_main_custatevec.cpp -o $(QSIMLIB_CUSTATEVEC) $(NVCCFLAGS) $(PYBINDFLAGS_CUSTATEVEC) +.PHONY: pybind-custatevecex +pybind-custatevecex: + $(NVCC) custatevecex/pybind_main_custatevecex.cpp -o $(QSIMLIB_CUSTATEVECEX) $(NVCCFLAGS) $(PYBINDFLAGS_CUSTATEVEC) + .PHONY: decide-custatevec decide-custatevec: - $(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA) + $(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -D__CUSTATEVECEX__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA) .PHONY: pybind-hip pybind-hip: @@ -119,4 +124,5 @@ clean: -rm -f ./cuda/*.x ./cuda/*.a ./cuda/*.so ./cuda/*.mod $(QSIMLIB_CUDA) -rm -f ./hip/*.x ./hip/*.a ./hip/*.so ./hip/*.mod $(QSIMLIB_HIP) -rm -f ./custatevec/*.x ./custatevec/*.a ./custatevec/*.so ./custatevec/*.mod $(QSIMLIB_CUSTATEVEC) + -rm -f ./custatevecex/*.x ./custatevecex/*.a ./custatevecex/*.so ./custatevecex/*.mod $(QSIMLIB_CUSTATEVECEX) -rm -f ./decide/*.x ./decide/*.a ./decide/*.so ./decide/*.mod $(QSIMLIB_DECIDE) diff --git a/pybind_interface/cuda/CMakeLists.txt b/pybind_interface/cuda/CMakeLists.txt index 6ef6be3cf..39d0b8bba 100644 --- a/pybind_interface/cuda/CMakeLists.txt +++ b/pybind_interface/cuda/CMakeLists.txt @@ -22,7 +22,7 @@ if(WIN32) # This prevents a conflict with /RTC1 in DEBUG builds. add_compile_options($<$>:/O2>) else() - add_compile_options(-O3 -flto=auto) + add_compile_options(-O3 -fno-lto) endif() if(APPLE) diff --git a/pybind_interface/custatevec/CMakeLists.txt b/pybind_interface/custatevec/CMakeLists.txt index 2bdd34c12..e4a5f808c 100644 --- a/pybind_interface/custatevec/CMakeLists.txt +++ b/pybind_interface/custatevec/CMakeLists.txt @@ -21,7 +21,7 @@ if(WIN32) # This prevents a conflict with /RTC1 in DEBUG builds. add_compile_options($<$>:/O2>) else() - add_compile_options(-O3 -flto=auto) + add_compile_options(-O3 -fno-lto) endif() if(APPLE) diff --git a/pybind_interface/custatevecex/CMakeLists.txt b/pybind_interface/custatevecex/CMakeLists.txt new file mode 100644 index 000000000..f25216af4 --- /dev/null +++ b/pybind_interface/custatevecex/CMakeLists.txt @@ -0,0 +1,59 @@ +# Copyright 2025 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.28) +project(qsim LANGUAGES CXX CUDA) + +if(WIN32) + add_compile_options(/openmp) + # Add /O2 to any configuration that is NOT Debug. + # This prevents a conflict with /RTC1 in DEBUG builds. + add_compile_options($<$>:/O2>) +else() + add_compile_options(-O3 -fno-lto) +endif() + +if(APPLE) + include_directories( + "/usr/local/include" + "/usr/local/opt/llvm/include" + "/opt/homebrew/include" + "/opt/homebrew/opt/llvm@19/include" + ) + link_directories( + "/usr/local/lib" + "/usr/local/opt/llvm/lib" + "/opt/homebrew/lib" + "/opt/homebrew/opt/llvm@19/lib" + ) +endif() + +include(../GetPybind11.cmake) +find_package(Python3 3.10 REQUIRED) + +include_directories(${pybind11_INCLUDE_DIRS}) + +include_directories($ENV{CUQUANTUM_ROOT}/include) +link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64) + +add_library(qsim_custatevecex MODULE pybind_main_custatevecex.cpp) +target_link_libraries(qsim_custatevecex -lcustatevec -lcublas) + +set_target_properties(qsim_custatevecex PROPERTIES + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}" +) +set_source_files_properties(pybind_main_custatevecex.cpp PROPERTIES LANGUAGE CUDA) + +target_link_libraries(qsim_custatevecex OpenMP::OpenMP_CXX) diff --git a/pybind_interface/custatevecex/pybind_main_custatevecex.cpp b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp new file mode 100644 index 000000000..c29a608a6 --- /dev/null +++ b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp @@ -0,0 +1,74 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "pybind_main_custatevecex.h" + +#include "../../lib/fuser_mqubit.h" +#include "../../lib/gates_cirq.h" +#include "../../lib/io.h" +#include "../../lib/multiprocess_custatevecex.h" +#include "../../lib/run_custatevecex.h" +#include "../../lib/simulator_custatevecex.h" + +namespace { + +qsim::MultiProcessCuStateVecEx mp; + +} // namespace { + +namespace qsim { + using Simulator = SimulatorCuStateVecEx; + + struct Factory { + // num_sim_threads, num_state_threads and num_dblocks are unused, but kept + // for consistency with other factories. + Factory(unsigned num_sim_threads, + unsigned num_state_threads, + unsigned num_dblocks) { + if (!mp.initialized()) { + mp.initialize(); + } + } + + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + using Gate = Cirq::GateCirq; + using Runner = CuStateVecExRunner; + struct RunnerParameter : public Runner::Parameter { + // max_fused_size is not used, but kept for consistency. + unsigned max_fused_size = 2; + }; + using NoisyRunner = qsim::QuantumTrajectorySimulator; + struct NoisyRunnerParameter : public NoisyRunner::Parameter { + // max_fused_size is not used, but kept for consistency. + unsigned max_fused_size = 2; + }; + + StateSpace CreateStateSpace() const { + return StateSpace{mp}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } + }; + + inline void SetFlushToZeroAndDenormalsAreZeros() {} + inline void ClearFlushToZeroAndDenormalsAreZeros() {} +} + +#include "../pybind_main.cpp" diff --git a/pybind_interface/custatevecex/pybind_main_custatevecex.h b/pybind_interface/custatevecex/pybind_main_custatevecex.h new file mode 100644 index 000000000..06290dda5 --- /dev/null +++ b/pybind_interface/custatevecex/pybind_main_custatevecex.h @@ -0,0 +1,17 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "../pybind_main.h" + +PYBIND11_MODULE(qsim_custatevecex, m) { GPU_MODULE_BINDINGS } diff --git a/pybind_interface/decide/CMakeLists.txt b/pybind_interface/decide/CMakeLists.txt index 0c8b8d941..b48e0e587 100644 --- a/pybind_interface/decide/CMakeLists.txt +++ b/pybind_interface/decide/CMakeLists.txt @@ -24,7 +24,7 @@ if(WIN32) # This prevents a conflict with /RTC1 in DEBUG builds. add_compile_options($<$>:/O2>) else() - add_compile_options(-O3 -flto=auto) + add_compile_options(-O3 -fno-lto) endif() if(APPLE) @@ -52,6 +52,9 @@ if(CMAKE_CUDA_COMPILER) target_compile_options(qsim_decide PRIVATE $<$:-D__CUSTATEVEC__> ) + target_compile_options(qsim_decide PRIVATE + $<$:-D__CUSTATEVECEX__> + ) endif() find_package(Python3 3.10 REQUIRED COMPONENTS Interpreter Development) include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include) diff --git a/pybind_interface/decide/decide.cpp b/pybind_interface/decide/decide.cpp index b40f6975e..51cd52425 100644 --- a/pybind_interface/decide/decide.cpp +++ b/pybind_interface/decide/decide.cpp @@ -61,7 +61,8 @@ int detect_instructions() { } enum GPUCapabilities { - CUDA = 0, CUSTATEVEC = 1, HIP = 2, NO_GPU = 10, NO_CUSTATEVEC = 11 }; + CUDA = 0, CUSTATEVEC = 1, CUSTATEVECEX = 2, HIP = 3, NO_GPU = 10, + NO_CUSTATEVEC = 11, NO_CUSTATEVECEX = 12 }; // For now, GPU detection is performed at compile time, as our wheels are // generated on Github Actions runners which do not have GPU support. @@ -93,6 +94,20 @@ int detect_custatevec() { return gpu; } +// For now, cuStateVecEx detection is performed at compile time, as our wheels +// are generated on Github Actions runners which do not have GPU support. +// +// Users wishing to use qsim with cuStateVecEx will need to compile locally on +// a device which has the necessary CUDA toolkit and cuStateVecEx library. +int detect_custatevecex() { + #if defined(__NVCC__) && defined(__CUSTATEVECEX__) + GPUCapabilities gpu = CUSTATEVECEX; + #else + GPUCapabilities gpu = NO_CUSTATEVECEX; + #endif + return gpu; +} + PYBIND11_MODULE(qsim_decide, m) { m.doc() = "pybind11 plugin"; // optional module docstring @@ -104,4 +119,7 @@ PYBIND11_MODULE(qsim_decide, m) { // Detect cuStateVec. m.def("detect_custatevec", &detect_custatevec, "Detect cuStateVec"); + + // Detect cuStateVecEx. + m.def("detect_custatevecex", &detect_custatevecex, "Detect cuStateVecEx"); } diff --git a/pybind_interface/hip/CMakeLists.txt b/pybind_interface/hip/CMakeLists.txt index 56f0cd0e6..4cad3230c 100644 --- a/pybind_interface/hip/CMakeLists.txt +++ b/pybind_interface/hip/CMakeLists.txt @@ -21,7 +21,7 @@ if(WIN32) # This prevents a conflict with /RTC1 in DEBUG builds. add_compile_options($<$>:/O2>) else() - add_compile_options(-O3 -flto=auto) + add_compile_options(-O3 -fno-lto) endif() include(../GetPybind11.cmake) diff --git a/qsimcirq/__init__.py b/qsimcirq/__init__.py index 7de9cc506..4a91936c6 100644 --- a/qsimcirq/__init__.py +++ b/qsimcirq/__init__.py @@ -35,7 +35,7 @@ def _load_qsim_gpu(): instr = qsim_decide.detect_gpu() if instr == 0: qsim_gpu = importlib.import_module("qsimcirq.qsim_cuda") - elif instr == 2: + elif instr == 3: qsim_gpu = importlib.import_module("qsimcirq.qsim_hip") else: qsim_gpu = None @@ -51,9 +51,19 @@ def _load_qsim_custatevec(): return qsim_custatevec +def _load_qsim_custatevecex(): + instr = qsim_decide.detect_custatevecex() + if instr == 2: + qsim_custatevecex = importlib.import_module("qsimcirq.qsim_custatevecex") + else: + qsim_custatevecex = None + return qsim_custatevecex + + qsim = _load_simd_qsim() qsim_gpu = _load_qsim_gpu() qsim_custatevec = _load_qsim_custatevec() +qsim_custatevecex = _load_qsim_custatevecex() # Note: the following imports must remain at the bottom of this file. diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py index b3c0106fc..240715639 100644 --- a/qsimcirq/qsim_simulator.py +++ b/qsimcirq/qsim_simulator.py @@ -21,7 +21,7 @@ import qsimcirq.qsim_circuit as qsimc -from . import qsim, qsim_custatevec, qsim_gpu +from . import qsim, qsim_custatevec, qsim_custatevecex, qsim_gpu # This should probably live in Cirq... @@ -60,9 +60,10 @@ class QSimOptions: simulation modes. use_gpu: whether to use GPU instead of CPU for simulation. The "gpu_*" arguments below are only considered if this is set to True. - gpu_mode: use CUDA if set to 0 (default value) or use the NVIDIA - cuStateVec library if set to any other value. The "gpu_*" - arguments below are only considered if this is set to 0. + gpu_mode: use CUDA if set to 0 (default value), use the NVIDIA + cuStateVec library if set to 1 or use the NVIDIA cuStateVecEx + library if set to any other value. The "gpu_*" arguments below are + only considered if this is set to 0. gpu_state_threads: number of threads per CUDA block to use for the GPU StateSpace. This must be a power of 2 in the range [32, 1024]. gpu_data_blocks: number of data blocks to use for the GPU StateSpace. @@ -180,16 +181,26 @@ def __init__( ) else: self._sim_module = qsim_gpu - else: + elif self.qsim_options["gmode"] == 1: if qsim_custatevec is None: raise ValueError( "cuStateVec GPU execution requested, but not " "supported. If your device has GPU support and the " - "NVIDIA cuStateVec library is installed, you may need " - "to compile qsim locally." + "NVIDIA cuStateVec library is installed, you may " + "need to compile qsim locally." ) else: self._sim_module = qsim_custatevec + else: + if qsim_custatevecex is None: + raise ValueError( + "cuStateVecEx GPU execution requested, but not " + "supported. If your device has GPU support and the " + "NVIDIA cuStateVecEx library is installed, you may " + "need to compile qsim locally." + ) + else: + self._sim_module = qsim_custatevecex else: self._sim_module = qsim diff --git a/qsimcirq_tests/qsimcirq_test.py b/qsimcirq_tests/qsimcirq_test.py index 37573b9c7..0b5cdbc8e 100644 --- a/qsimcirq_tests/qsimcirq_test.py +++ b/qsimcirq_tests/qsimcirq_test.py @@ -1533,6 +1533,115 @@ def test_qsim_custatevec_input_state(): assert cirq.approx_eq(state_vector[i], 0, atol=1e-6) +def test_cirq_qsim_custatevecex_amplitudes(): + if qsimcirq.qsim_custatevecex is None: + pytest.skip("cuStateVecEx library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.CNOT(a, b), cirq.CNOT(b, a), cirq.X(a)) + + # Enable GPU acceleration. + custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options) + result = qsimGpuSim.compute_amplitudes( + cirq_circuit, bitstrings=[0b00, 0b01, 0b10, 0b11] + ) + assert np.allclose(result, [0j, 0j, (1 + 0j), 0j]) + + +def test_cirq_qsim_custatevecex_simulate(): + if qsimcirq.qsim_custatevecex is None: + pytest.skip("cuStateVecEx library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + + # Enable GPU acceleration. + custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options) + result = qsimGpuSim.simulate(cirq_circuit) + assert result.state_vector().shape == (4,) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate(cirq_circuit) + assert cirq.linalg.allclose_up_to_global_phase( + result.state_vector(), cirq_result.state_vector(), atol=1.0e-6 + ) + + +def test_cirq_qsim_custatevecex_expectation_values(): + if qsimcirq.qsim_custatevecex is None: + pytest.skip("cuStateVecEx library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + obs = [cirq.Z(a) * cirq.Z(b)] + + # Enable GPU acceleration. + custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options) + result = qsimGpuSim.simulate_expectation_values(cirq_circuit, obs) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate_expectation_values(cirq_circuit, obs) + assert np.allclose(result, cirq_result) + + +def test_cirq_qsim_custatevecex_input_state(): + if qsimcirq.qsim_custatevecex is None: + pytest.skip("cuStateVecEx library is not available for testing.") + # Pick qubits. + a, b = [cirq.GridQubit(0, 0), cirq.GridQubit(0, 1)] + + # Create a circuit + cirq_circuit = cirq.Circuit(cirq.H(a), cirq.CNOT(a, b), cirq.X(b)) + + # Enable GPU acceleration. + custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options) + initial_state = np.asarray([0.5] * 4, dtype=np.complex64) + result = qsimGpuSim.simulate(cirq_circuit, initial_state=initial_state) + assert result.state_vector().shape == (4,) + + cirqSim = cirq.Simulator() + cirq_result = cirqSim.simulate(cirq_circuit, initial_state=initial_state) + assert cirq.linalg.allclose_up_to_global_phase( + result.state_vector(), cirq_result.state_vector(), atol=1.0e-6 + ) + + +def test_qsim_custatevecex_input_state(): + if qsimcirq.qsim_custatevecex is None: + pytest.skip("cuStateVecEx library is not available for testing.") + + for num_qubits in range(1, 8): + size = 2**num_qubits + qubits = cirq.LineQubit.range(num_qubits) + circuit = cirq.Circuit() + + for k in range(num_qubits): + circuit.append(cirq.H(qubits[k])) + + # Enable GPU acceleration. + custatevecex_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=2) + qsimGpuSim = qsimcirq.QSimSimulator(qsim_options=custatevecex_options) + initial_state = np.asarray([np.sqrt(1.0 / size)] * size, dtype=np.complex64) + result = qsimGpuSim.simulate(circuit, initial_state=initial_state) + state_vector = result.state_vector() + + assert result.state_vector().shape == (size,) + assert cirq.approx_eq(state_vector[0], 1, atol=1e-6) + + for i in range(1, size): + assert cirq.approx_eq(state_vector[i], 0, atol=1e-6) + + def test_cirq_qsim_old_options(): old_options = {"f": 3, "t": 4, "r": 100, "v": 1} old_sim = qsimcirq.QSimSimulator(qsim_options=old_options) diff --git a/setup.py b/setup.py index 328b27cdd..b1c2b54d3 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,7 @@ def build_extension(self, ext): CMakeExtension("qsimcirq/qsim_basic"), CMakeExtension("qsimcirq/qsim_cuda"), CMakeExtension("qsimcirq/qsim_custatevec"), + CMakeExtension("qsimcirq/qsim_custatevecex"), CMakeExtension("qsimcirq/qsim_decide"), CMakeExtension("qsimcirq/qsim_hip"), ], diff --git a/tests/Makefile b/tests/Makefile index e09b5fc4b..46f35e492 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -47,6 +47,9 @@ CUDA_TARGETS := $(CUDA_FILES:%cuda_test.cu=%cuda_test.x) CUSTATEVEC_FILES := $(wildcard *custatevec_test.cu) CUSTATEVEC_TARGETS := $(CUSTATEVEC_FILES:%custatevec_test.cu=%custatevec_test.x) +CUSTATEVECEX_FILES := $(wildcard *custatevecex_test.cu) +CUSTATEVECEX_TARGETS := $(CUSTATEVECEX_FILES:%custatevecex_test.cu=%custatevecex_test.x) + HIP_FILES := $(wildcard *cuda_test.cu) HIP_TARGETS := $(HIP_FILES:%cuda_test.cu=%hip_test.x) @@ -66,6 +69,9 @@ cuda-tests: $(CUDA_TARGETS) .PHONY: custatevec-tests custatevec-tests: $(CUSTATEVEC_TARGETS) +.PHONY: custatevecex-tests +custatevecex-tests: $(CUSTATEVECEX_TARGETS) + .PHONY: hip-tests hip-tests: $(HIP_TARGETS) @@ -81,6 +87,14 @@ run-cuda-tests: | $(GTEST_DIR)/build cuda-tests run-custatevec-tests: | $(GTEST_DIR)/build custatevec-tests for exe in $(CUSTATEVEC_TARGETS); do if ! ./$$exe; then exit 1; fi; done +.PHONY: run-custatevecex-tests +run-custatevecex-tests: | $(GTEST_DIR)/build custatevecex-tests + for exe in $(CUSTATEVECEX_TARGETS); do if ! ./$$exe; then exit 1; fi; done + +.PHONY: run-custatevecex-mpi-tests +run-custatevecex-mpi-tests: | $(GTEST_DIR)/build custatevecex-tests + for exe in $(CUSTATEVECEX_TARGETS); do if ! mpirun -np 2 ./$$exe; then exit 1; fi; done + .PHONY: run-hip-tests run-hip-tests: | $(GTEST_DIR)/build hip-tests for exe in $(HIP_TARGETS); do if ! ./$$exe; then exit 1; fi; done @@ -100,6 +114,9 @@ $(GTEST_DIR)/build: %custatevec_test.x: %custatevec_test.cu $(GTEST_DIR)/build $(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS) +%custatevecex_test.x: %custatevecex_test.cu $(GTEST_DIR)/build + $(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS) + %hip_test.x: %cuda_test.cu $(GTEST_DIR)/build $(HIPCC) -o ./$@ $< $(TESTFLAGS) $(HIPCCFLAGS) diff --git a/tests/hybrid_custatevecex_test.cu b/tests/hybrid_custatevecex_test.cu new file mode 100644 index 000000000..a0c75b031 --- /dev/null +++ b/tests/hybrid_custatevecex_test.cu @@ -0,0 +1,59 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hybrid_testfixture.h" + +#include + +#include "gtest/gtest.h" + +#include "../lib/multiprocess_custatevecex.h" +#include "../lib/simulator_custatevecex.h" + +namespace qsim { + +MultiProcessCuStateVecEx mp; + +template +struct Factory { + using fp_type = FP; + using Simulator = qsim::SimulatorCuStateVecEx; + using StateSpace = typename Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace{mp}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } +}; + +TEST(HybridCuStateVecExTest, Hybrid2) { + TestHybrid2(qsim::Factory()); +} + +TEST(HybridCuStateVecExTest, Hybrid4) { + TestHybrid4(qsim::Factory()); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + qsim::mp.initialize(); + + return RUN_ALL_TESTS(); +} diff --git a/tests/qtrajectory_custatevecex_test.cu b/tests/qtrajectory_custatevecex_test.cu new file mode 100644 index 000000000..8d70bfc00 --- /dev/null +++ b/tests/qtrajectory_custatevecex_test.cu @@ -0,0 +1,88 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "qtrajectory_testfixture.h" + +#include + +#include "gtest/gtest.h" + +#include "../lib/io.h" +#include "../lib/multiprocess_custatevecex.h" +#include "../lib/run_custatevecex.h" +#include "../lib/simulator_custatevecex.h" + +namespace qsim { + +MultiProcessCuStateVecEx mp; + +template +struct Factory { + using fp_type = FP; + using Simulator = qsim::SimulatorCuStateVecEx; + using StateSpace = typename Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace{mp}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } +}; + +TEST(QTrajectoryCuStateVecExTest, BitFlip) { + using Runner = CuStateVecExRunner>; + TestBitFlip(Factory()); +} + +TEST(QTrajectoryCuStateVecExTest, GenDump) { + using Runner = CuStateVecExRunner>; + TestGenDump(Factory()); +} + +TEST(QTrajectoryCuStateVecExTest, ReusingResults) { + using Runner = CuStateVecExRunner>; + TestReusingResults(Factory()); +} + +TEST(QTrajectoryCuStateVecExTest, CollectKopStat) { + using Runner = CuStateVecExRunner>; + TestCollectKopStat(Factory()); +} + +TEST(QTrajectoryCuStateVecExTest, CleanCircuit) { + using Runner = CuStateVecExRunner>; + TestCleanCircuit(Factory()); +} + +TEST(QTrajectoryCuStateVecExTest, InitialState) { + using Runner = CuStateVecExRunner>; + TestInitialState(Factory()); +} + +TEST(QTrajectoryCuStateVecExTest, UncomputeFinalState) { + using Runner = CuStateVecExRunner>; + TestUncomputeFinalState(Factory()); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + qsim::mp.initialize(); + + return RUN_ALL_TESTS(); +} diff --git a/tests/run_custatevecex_test.cu b/tests/run_custatevecex_test.cu new file mode 100644 index 000000000..079fd2696 --- /dev/null +++ b/tests/run_custatevecex_test.cu @@ -0,0 +1,262 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gates_cirq_testfixture.h" + +#include + +#include "gtest/gtest.h" + +#include "../lib/circuit_qsim_parser.h" +#include "../lib/gates_qsim.h" +#include "../lib/io.h" +#include "../lib/multiprocess_custatevecex.h" +#include "../lib/run_custatevecex.h" +#include "../lib/simulator_custatevecex.h" + +namespace qsim { + +constexpr char provider[] = "run_custatevecex_test"; + +constexpr char circuit_string[] = +R"(4 +0 h 0 +0 h 1 +0 h 2 +0 h 3 +1 cz 0 1 +1 cz 2 3 +2 t 0 +2 x 1 +2 y 2 +2 t 3 +3 y 0 +3 cz 1 2 +3 x 3 +4 t 1 +4 t 2 +5 cz 1 2 +6 x 1 +6 y 2 +7 cz 1 2 +8 t 1 +8 t 2 +9 cz 0 1 +9 cz 2 3 +10 h 0 +10 h 1 +10 h 2 +10 h 3 +)"; + +MultiProcessCuStateVecEx mp; + +struct Factory { + using Simulator = qsim::SimulatorCuStateVecEx; + using StateSpace = typename Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace{mp}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } +}; + +TEST(RunQSimTest, QSimRunner1) { + std::stringstream ss(circuit_string); + Circuit> circuit; + + EXPECT_TRUE(CircuitQsimParser::FromStream(99, provider, ss, circuit)); + EXPECT_EQ(circuit.num_qubits, 4); + EXPECT_EQ(circuit.gates.size(), 27); + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Runner = CuStateVecExRunner; + + float entropy = 0; + + auto measure = [&entropy]( + unsigned k, const StateSpace& state_space, const State& state) { + // Calculate entropy. + + entropy = 0; + auto size = uint64_t{1} << state.num_qubits(); + + for (uint64_t i = 0; i < size; ++i) { + auto ampl = state_space.GetAmpl(state, i); + float p = std::norm(ampl); + entropy -= p * std::log(p); + } + }; + + Runner::Parameter param; + param.seed = 1; + param.verbosity = 0; + + EXPECT_TRUE(Runner::Run(param, Factory(), circuit, measure)); + + EXPECT_NEAR(entropy, 2.2192848, 1e-6); +} + +TEST(RunQSimTest, QSimRunner2) { + std::stringstream ss(circuit_string); + Circuit> circuit; + + EXPECT_TRUE(CircuitQsimParser::FromStream(99, provider, ss, circuit)); + EXPECT_EQ(circuit.num_qubits, 4); + EXPECT_EQ(circuit.gates.size(), 27); + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Runner = CuStateVecExRunner; + + Factory factory; + StateSpace state_space = factory.CreateStateSpace(); + State state = state_space.Create(circuit.num_qubits); + + EXPECT_FALSE(state_space.IsNull(state)); + + state_space.SetStateZero(state); + + Runner::Parameter param; + param.seed = 1; + param.verbosity = 0; + + EXPECT_TRUE(Runner::Run(param, Factory(), circuit, state)); + + // Calculate entropy. + + float entropy = 0; + auto size = uint64_t{1} << circuit.num_qubits; + + for (uint64_t i = 0; i < size; ++i) { + auto ampl = state_space.GetAmpl(state, i); + float p = std::norm(ampl); + entropy -= p * std::log(p); + } + + EXPECT_NEAR(entropy, 2.2192848, 1e-6); +} + +constexpr char sample_circuit_string[] = +R"(2 +0 h 0 +0 x 1 +1 m 1 +2 cx 0 1 +3 m 0 1 +4 m 0 +5 cx 1 0 +6 m 0 +7 x 0 +7 h 1 +8 m 0 1 +)"; + +TEST(RunQSimTest, QSimSampler) { + std::stringstream ss(sample_circuit_string); + Circuit> circuit; + + EXPECT_TRUE(CircuitQsimParser::FromStream(99, provider, ss, circuit)); + EXPECT_EQ(circuit.num_qubits, 2); + EXPECT_EQ(circuit.gates.size(), 11); + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using Result = StateSpace::MeasurementResult; + using State = StateSpace::State; + using Runner = CuStateVecExRunner; + + Factory factory; + StateSpace state_space = factory.CreateStateSpace(); + State state = state_space.Create(circuit.num_qubits); + + EXPECT_FALSE(state_space.IsNull(state)); + + state_space.SetStateZero(state); + + std::vector results; + + Runner::Parameter param; + param.seed = 1; + param.verbosity = 0; + + EXPECT_TRUE(Runner::Run(param, Factory(), circuit, state, results)); + + // Results should contain (qubit @ time): + // (1 @ 1) - should be |01) + EXPECT_TRUE(results[0].bitstring[0]); + // (0 @ 3), (1 @ 3) - either |01) or |10) + EXPECT_EQ(results[1].bitstring[0], !results[1].bitstring[1]); + // (0 @ 4) - should match (0 @ 3) + EXPECT_EQ(results[1].bitstring[0], results[2].bitstring[0]); + // (0 @ 6) - either |11) or |10) + EXPECT_TRUE(results[3].bitstring[0]); + // (0 @ 8), (1 @ 8) - should be |00) + EXPECT_FALSE(results[4].bitstring[0]); + EXPECT_FALSE(results[4].bitstring[1]); +} + +TEST(RunQSimTest, CirqGates) { + auto circuit = CirqCircuit1::GetCircuit(false); + const auto& expected_results = CirqCircuit1::expected_results0; + + using Simulator = Factory::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Runner = CuStateVecExRunner; + + Factory factory; + StateSpace state_space = factory.CreateStateSpace(); + State state = state_space.Create(circuit.num_qubits); + + auto size = uint64_t{1} << circuit.num_qubits; + + EXPECT_FALSE(state_space.IsNull(state)); + EXPECT_EQ(size, expected_results.size()); + + state_space.SetStateZero(state); + + Runner::Parameter param; + param.seed = 1; + param.verbosity = 0; + + EXPECT_TRUE(Runner::Run(param, Factory(), circuit, state)); + + for (uint64_t i = 0; i < size; ++i) { + auto ampl = state_space.GetAmpl(state, i); + EXPECT_NEAR(std::real(ampl), std::real(expected_results[i]), 2e-6); + EXPECT_NEAR(std::imag(ampl), std::imag(expected_results[i]), 2e-6); + } +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + qsim::mp.initialize(); + + return RUN_ALL_TESTS(); +} diff --git a/tests/simulator_custatevecex_test.cu b/tests/simulator_custatevecex_test.cu new file mode 100644 index 000000000..dcf9eaf65 --- /dev/null +++ b/tests/simulator_custatevecex_test.cu @@ -0,0 +1,105 @@ +// Copyright 2025 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "simulator_testfixture.h" + +#include + +#include + +#include "gtest/gtest.h" + +#include "../lib/multiprocess_custatevecex.h" +#include "../lib/simulator_custatevecex.h" + +namespace qsim { + +template +class SimulatorCuStateVecExTest : public testing::Test {}; + +//using fp_impl = ::testing::Types; +using fp_impl = ::testing::Types; + +TYPED_TEST_SUITE(SimulatorCuStateVecExTest, fp_impl); + +MultiProcessCuStateVecEx mp; + +template +struct Factory { + using Simulator = qsim::SimulatorCuStateVecEx; + using StateSpace = typename Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace{mp}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } +}; + +TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate1) { + TestApplyGate1(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate2) { + TestApplyGate2(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate3) { + TestApplyGate3(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, ApplyGate5) { + TestApplyGate5(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, CircuitWithControlledGates) { + TestCircuitWithControlledGates(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, CircuitWithControlledGatesDagger) { + TestCircuitWithControlledGatesDagger(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, MultiQubitGates) { + TestMultiQubitGates(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, ControlledGates) { + bool high_precision = std::is_same::value; + TestControlledGates(qsim::Factory(), high_precision, true); +} + +TYPED_TEST(SimulatorCuStateVecExTest, GlobalPhaseGate) { + TestGlobalPhaseGate(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, ExpectationValue1) { + TestExpectationValue1(qsim::Factory()); +} + +TYPED_TEST(SimulatorCuStateVecExTest, ExpectationValue2) { + TestExpectationValue2(qsim::Factory()); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + qsim::mp.initialize(); + + return RUN_ALL_TESTS(); +} diff --git a/tests/simulator_testfixture.h b/tests/simulator_testfixture.h index 2e15c287f..bf46353a7 100644 --- a/tests/simulator_testfixture.h +++ b/tests/simulator_testfixture.h @@ -362,7 +362,8 @@ void TestCircuitWithControlledGates(const Factory& factory) { using fp_type = typename StateSpace::fp_type; using Gate = GateQSim; - unsigned num_qubits = 6; + unsigned num_qubits = 7; + unsigned size = 1 << (num_qubits - 1); std::vector gates; gates.reserve(128); @@ -722,10 +723,8 @@ if __name__ == '__main__': {-0.18774915, 0.12311842}, }; - unsigned size = 1 << num_qubits; - for (unsigned i = 0; i < size; ++i) { - auto a = StateSpace::GetAmpl(state1, i); + auto a = state_space.GetAmpl(state1, i); EXPECT_NEAR(std::real(a), expected_results[i][0], 1e-6); EXPECT_NEAR(std::imag(a), expected_results[i][1], 1e-6); } @@ -740,8 +739,8 @@ if __name__ == '__main__': } for (unsigned i = 0; i < size; ++i) { - auto a1 = StateSpace::GetAmpl(state1, i); - auto a2 = StateSpace::GetAmpl(state2, i); + auto a1 = state_space.GetAmpl(state1, i); + auto a2 = state_space.GetAmpl(state2, i); EXPECT_EQ(std::real(a1), std::real(a2)); EXPECT_EQ(std::imag(a1), std::imag(a2)); } @@ -756,8 +755,8 @@ if __name__ == '__main__': } for (unsigned i = 0; i < size; ++i) { - auto a1 = StateSpace::GetAmpl(state1, i); - auto a2 = StateSpace::GetAmpl(state3, i); + auto a1 = state_space.GetAmpl(state1, i); + auto a2 = state_space.GetAmpl(state3, i); EXPECT_EQ(std::real(a1), std::real(a2)); EXPECT_EQ(std::imag(a1), std::imag(a2)); } @@ -770,8 +769,8 @@ void TestCircuitWithControlledGatesDagger(const Factory& factory) { using fp_type = typename StateSpace::fp_type; using Gate = GateQSim; - unsigned num_qubits = 6; - unsigned size = 1 << num_qubits; + unsigned num_qubits = 7; + unsigned size = 1 << (num_qubits - 1); std::vector gates; gates.reserve(128); @@ -1133,10 +1132,10 @@ if __name__ == '__main__': */ - EXPECT_NEAR(std::real(StateSpace::GetAmpl(state, 0)), 1, 1e-6); - EXPECT_NEAR(std::imag(StateSpace::GetAmpl(state, 0)), 0, 1e-6); + EXPECT_NEAR(std::real(state_space.GetAmpl(state, 0)), 1, 1e-6); + EXPECT_NEAR(std::imag(state_space.GetAmpl(state, 0)), 0, 1e-6); for (unsigned i = 1; i < size; ++i) { - auto a = StateSpace::GetAmpl(state, i); + auto a = state_space.GetAmpl(state, i); EXPECT_NEAR(std::real(a), 0, 1e-6); EXPECT_NEAR(std::imag(a), 0, 1e-6); } @@ -1162,14 +1161,14 @@ void TestMultiQubitGates(const Factory& factory) { std::vector vec(state_space.MinSize(max_num_qubits)); - for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) { + for (unsigned num_qubits = 2; num_qubits <= max_num_qubits; ++num_qubits) { auto state = state_space.Create(num_qubits); unsigned size = 1 << num_qubits; fp_type inorm = std::sqrt(1.0 / (1 << num_qubits)); - unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits); + unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits - 1); - for (unsigned q = 0; q <= max_gate_qubits2; ++q) { + for (unsigned q = 1; q <= max_gate_qubits2; ++q) { unsigned size1 = 1 << q; unsigned size2 = size1 * size1; @@ -1432,10 +1431,10 @@ void TestExpectationValue1(const Factory& factory) { std::vector vec(state_space.MinSize(max_num_qubits)); - for (unsigned num_qubits = 1; num_qubits <= max_num_qubits; ++num_qubits) { + for (unsigned num_qubits = 2; num_qubits <= max_num_qubits; ++num_qubits) { auto state = state_space.Create(num_qubits); - unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits); + unsigned max_gate_qubits2 = std::min(max_gate_qubits, num_qubits - 1); for (unsigned q = 1; q <= max_gate_qubits2; ++q) { unsigned size1 = 1 << q; diff --git a/tests/statespace_custatevecex_test.cu b/tests/statespace_custatevecex_test.cu new file mode 100644 index 000000000..db840d7c9 --- /dev/null +++ b/tests/statespace_custatevecex_test.cu @@ -0,0 +1,119 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "statespace_testfixture.h" + +#include + +#include "gtest/gtest.h" + +#include "../lib/multiprocess_custatevecex.h" +#include "../lib/simulator_custatevecex.h" +#include "../lib/statespace_custatevecex.h" + +namespace qsim { + +template +class StateSpaceCuStateVecExTest : public testing::Test {}; + +using fp_impl = ::testing::Types; + +TYPED_TEST_SUITE(StateSpaceCuStateVecExTest, fp_impl); + +MultiProcessCuStateVecEx mp; + +template +struct Factory { + using Simulator = qsim::SimulatorCuStateVecEx; + using StateSpace = typename Simulator::StateSpace; + + StateSpace CreateStateSpace() const { + return StateSpace{mp}; + } + + Simulator CreateSimulator() const { + return Simulator{}; + } +}; + +TYPED_TEST(StateSpaceCuStateVecExTest, Add) { + TestAdd(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, NormSmall) { + TestNormSmall(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, NormAndInnerProductSmall) { + TestNormAndInnerProductSmall(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, NormAndInnerProduct) { + TestNormAndInnerProduct(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, SamplingSmall) { + TestSamplingSmall(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, SamplingCrossEntropyDifference) { + TestSamplingCrossEntropyDifference(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, Ordering) { + TestOrdering(qsim::Factory()); +} + +TEST(StateSpaceCuStateVecExTest, MeasurementSmall) { + TestMeasurementSmall(qsim::Factory(), true); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, MeasurementLarge) { +// This test fails. +// TestMeasurementLarge(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, Collapse) { +// Not implemented. +// TestCollapse(qsim::Factory()); +} + +TEST(StateSpaceCuStateVecExTest, InvalidStateSize) { + TestInvalidStateSize(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmpl) { +// Not implemented. +// TestBulkSetAmplitude(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmplExclusion) { +// Not implemented. +// TestBulkSetAmplitudeExclusion(qsim::Factory()); +} + +TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmplDefault) { +// Not implemented. +// TestBulkSetAmplitudeDefault(factory); +} + +} // namespace qsim + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + qsim::mp.initialize(); + + return RUN_ALL_TESTS(); +} From 03ae3c2163f8181b98316d2fcb6e06af627848c4 Mon Sep 17 00:00:00 2001 From: Sergei Isakov <54642992+sergeisakov@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:02:43 +0100 Subject: [PATCH 2/3] Address gemini-code-assist comments. --- apps/Makefile | 7 +++++-- apps/qsim_base_custatevecex.cu | 1 - lib/BUILD | 1 - lib/multiprocess_custatevecex.h | 4 ++-- lib/run_custatevecex.h | 9 +++++---- lib/simulator_custatevec.h | 17 +++++++++++------ lib/simulator_custatevecex.h | 20 +++++++++++++------- lib/statespace_custatevec.h | 6 ++++-- lib/statespace_custatevecex.h | 4 ++-- lib/vectorspace_custatevecex.h | 5 ++--- 10 files changed, 44 insertions(+), 30 deletions(-) diff --git a/apps/Makefile b/apps/Makefile index fd18635fc..58ab53054 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -7,8 +7,8 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x) CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu") CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x) -CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevecex.cu") -CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevecex.x) +CUSTATEVECEX_TARGETS = $(shell find . -maxdepth 1 -name "*custatevecex.cu") +CUSTATEVECEX_TARGETS := $(CUSTATEVEC_TARGETS:%custatevecex.cu=%custatevecex.x) HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu') HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x) @@ -22,6 +22,9 @@ qsim-cuda: $(CUDA_TARGETS) .PHONY: qsim-custatevec qsim-custatevec: $(CUSTATEVEC_TARGETS) +.PHONY: qsim-custatevecex +qsim-custatevecex: $(CUSTATEVECEX_TARGETS) + .PHONY: qsim-hip qsim-hip: $(HIP_TARGETS) diff --git a/apps/qsim_base_custatevecex.cu b/apps/qsim_base_custatevecex.cu index 8cc12fb80..99ce1a283 100644 --- a/apps/qsim_base_custatevecex.cu +++ b/apps/qsim_base_custatevecex.cu @@ -54,7 +54,6 @@ Options GetOptions(int argc, char* argv[]) { case 's': opt.seed = std::atoi(optarg); break; - break; case 'v': opt.verbosity = std::atoi(optarg); break; diff --git a/lib/BUILD b/lib/BUILD index 50ef51491..60fd0e51a 100644 --- a/lib/BUILD +++ b/lib/BUILD @@ -561,7 +561,6 @@ cuda_library( ":util_cuda", ":util_custatevec", ":util_custatevecex", - ":vectorspace_custatevecex", ], ) diff --git a/lib/multiprocess_custatevecex.h b/lib/multiprocess_custatevecex.h index 001498aac..9c4a13bb1 100644 --- a/lib/multiprocess_custatevecex.h +++ b/lib/multiprocess_custatevecex.h @@ -162,8 +162,8 @@ struct MultiProcessCuStateVecEx { ErrorCheck(custatevecExConfigureStateVectorMultiProcess( &sv_config, data_type, num_qubits, num_local_qubits, -1, memory_sharing_method_, global_index_bit_classes_.data(), - (int32_t*) num_global_qubits_per_layer_.data(), - (int32_t) global_index_bit_classes_.size(), + reinterpret_cast(num_global_qubits_per_layer_.data()), + static_cast(global_index_bit_classes_.size()), param_.transfer_buffer_size, nullptr, 0)); return sv_config; diff --git a/lib/run_custatevecex.h b/lib/run_custatevecex.h index 390f11013..2a2b8b1da 100644 --- a/lib/run_custatevecex.h +++ b/lib/run_custatevecex.h @@ -260,7 +260,8 @@ struct CuStateVecExRunner final { ErrorCheck(custatevecExSVUpdaterEnqueueMatrix( sv_updater, gate.matrix.data(), StateSpace::kMatrixDataType, StateSpace::kExMatrixType, StateSpace::kMatrixLayout, 0, - (int32_t*) gate.qubits.data(), num_qubits, nullptr, nullptr, 0)); + reinterpret_cast(gate.qubits.data()), + num_qubits, nullptr, nullptr, 0)); } } else { std::vector control_bits; @@ -273,9 +274,9 @@ struct CuStateVecExRunner final { ErrorCheck(custatevecExSVUpdaterEnqueueMatrix( sv_updater, gate.matrix.data(), StateSpace::kMatrixDataType, StateSpace::kExMatrixType, StateSpace::kMatrixLayout, 0, - (int32_t*) gate.qubits.data(), num_qubits, - (int32_t*) gate.controlled_by.data(), control_bits.data(), - num_cqubits)); + reinterpret_cast(gate.qubits.data()), num_qubits, + reinterpret_cast(gate.controlled_by.data()), + control_bits.data(), num_cqubits)); } if (times_to_measure_at.size() > 0) { diff --git a/lib/simulator_custatevec.h b/lib/simulator_custatevec.h index b3f3cb8fa..a13c6e1af 100644 --- a/lib/simulator_custatevec.h +++ b/lib/simulator_custatevec.h @@ -82,8 +82,9 @@ class SimulatorCuStateVec final { ErrorCheck(custatevecApplyMatrix( custatevec_handle_, state.get(), kStateType, state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, - (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0, - kComputeType, workspace_, workspace_size)); + reinterpret_cast(qs.data()), qs.size(), + nullptr, nullptr, 0, kComputeType, workspace_, + workspace_size)); } } @@ -118,9 +119,10 @@ class SimulatorCuStateVec final { ErrorCheck(custatevecApplyMatrix( custatevec_handle_, state.get(), kStateType, state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, - (int32_t*) qs.data(), qs.size(), - (int32_t*) cqs.data(), control_bits.data(), cqs.size(), - kComputeType, workspace_, workspace_size)); + reinterpret_cast(qs.data()), qs.size(), + reinterpret_cast(cqs.data()), + control_bits.data(), cqs.size(), kComputeType, + workspace_, workspace_size)); } } @@ -144,9 +146,12 @@ class SimulatorCuStateVec final { ErrorCheck(custatevecComputeExpectation( custatevec_handle_, state.get(), kStateType, state.num_qubits(), &eval, kExpectType, nullptr, matrix, - kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(), + kMatrixType, kMatrixLayout, + reinterpret_cast(qs.data()), qs.size(), kComputeType, workspace_, workspace_size)); + ErrorCheck(cudaDeviceSynchronize()); + return {cuCreal(eval), cuCimag(eval)}; } diff --git a/lib/simulator_custatevecex.h b/lib/simulator_custatevecex.h index cb4526ec8..bcfb2c519 100644 --- a/lib/simulator_custatevecex.h +++ b/lib/simulator_custatevecex.h @@ -72,7 +72,8 @@ class SimulatorCuStateVecEx final { ErrorCheck(custatevecExApplyMatrix( state.get(), matrix, kMatrixDataType, kExMatrixType, kMatrixLayout, - 0, (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0)); + 0, reinterpret_cast(qs.data()), qs.size(), + nullptr, nullptr, 0)); } } @@ -112,8 +113,9 @@ class SimulatorCuStateVecEx final { ErrorCheck(custatevecExApplyMatrix( state.get(), matrix, kMatrixDataType, kExMatrixType, kMatrixLayout, - 0, (int32_t*) qs.data(), qs.size(), (int32_t*) cqs.data(), - control_bits.data(), cqs.size())); + 0, reinterpret_cast(qs.data()), qs.size(), + reinterpret_cast(cqs.data()), control_bits.data(), + cqs.size())); } } @@ -189,8 +191,8 @@ class SimulatorCuStateVecEx final { if (l > 0) { ErrorCheck(custatevecExStateVectorPermuteIndexBits( - state.get(), (int32_t*) perm.data(), num_qubits, - CUSTATEVEC_EX_PERMUTATION_SCATTER)); + state.get(), reinterpret_cast(perm.data()), + num_qubits, CUSTATEVEC_EX_PERMUTATION_SCATTER)); } auto f = [&matrix, &state, &num_local_qubits, &qs2]( @@ -198,6 +200,8 @@ class SimulatorCuStateVecEx final { void* workspace; size_t workspace_size; + ErrorCheck(cudaSetDevice(r.device_id)); + ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( r.custatevec_handle, kStateDataType, num_local_qubits, matrix, kMatrixDataType, kMatrixLayout, qs2.size(), kComputeType, @@ -211,9 +215,11 @@ class SimulatorCuStateVecEx final { ErrorCheck(custatevecComputeExpectation( r.custatevec_handle, r.device_ptr, kStateDataType, num_local_qubits, &eval, kExpectDataType, nullptr, matrix, kMatrixDataType, - kMatrixLayout, (int32_t*) qs2.data(), qs2.size(), kComputeType, - workspace, workspace_size)); + kMatrixLayout, reinterpret_cast(qs2.data()), + qs2.size(), kComputeType, workspace, workspace_size)); + // TODO: make it faster. + ErrorCheck(custatevecExStateVectorSynchronize(state.get())); ErrorCheck(cudaFree(workspace)); return std::complex{cuCreal(eval), cuCimag(eval)}; diff --git a/lib/statespace_custatevec.h b/lib/statespace_custatevec.h index f2f5de107..6bd0a37d2 100644 --- a/lib/statespace_custatevec.h +++ b/lib/statespace_custatevec.h @@ -306,8 +306,10 @@ class StateSpaceCuStateVec : ErrorCheck(custatevecBatchMeasure( custatevec_handle_, state.get(), kStateType, - state.num_qubits(), (int*) result.bitstring.data(), - (int*) qubits.data(), qubits.size(), r, collapse)); + state.num_qubits(), + reinterpret_cast(result.bitstring.data()), + reinterpret_cast(qubits.data()), qubits.size(), + r, collapse)); for (std::size_t i = 0; i < result.bitstring.size(); ++i) { result.bits |= result.bitstring[i] << qubits[i]; diff --git a/lib/statespace_custatevecex.h b/lib/statespace_custatevecex.h index 8b27822c7..ce5cb0c3e 100644 --- a/lib/statespace_custatevecex.h +++ b/lib/statespace_custatevecex.h @@ -402,8 +402,8 @@ class StateSpaceCuStateVecEx : custatevecIndex_t bits; ErrorCheck(custatevecExMeasure( - state.get(), &bits, (int32_t*) qubits.data(), qubits.size(), - r, collapse, nullptr)); + state.get(), &bits, reinterpret_cast(qubits.data()), + qubits.size(), r, collapse, nullptr)); ErrorCheck(custatevecExStateVectorSynchronize(state.get())); for (std::size_t i = 0; i < qubits.size(); ++i) { diff --git a/lib/vectorspace_custatevecex.h b/lib/vectorspace_custatevecex.h index ed72c403b..3fa26a931 100644 --- a/lib/vectorspace_custatevecex.h +++ b/lib/vectorspace_custatevecex.h @@ -163,8 +163,7 @@ class VectorSpaceCuStateVecEx { const auto& get_wire_ordering() const { ErrorCheck(custatevecExStateVectorGetProperty( ptr_, CUSTATEVEC_EX_SV_PROP_WIRE_ORDERING, - const_cast(wire_ordering_.data()), - sizeof(int32_t) * num_qubits_)); + wire_ordering_.data(), sizeof(int32_t) * num_qubits_)); return wire_ordering_; } @@ -373,7 +372,7 @@ class VectorSpaceCuStateVecEx { private: const MultiProcessCuStateVecEx* mp_; custatevecExStateVectorDescriptor_t ptr_; - std::vector wire_ordering_; + mutable std::vector wire_ordering_; unsigned num_qubits_; unsigned num_substates_; DistributionType distr_type_; From e1705798b610152b9922e85a42e25957095d96e2 Mon Sep 17 00:00:00 2001 From: Sergei Isakov <54642992+sergeisakov@users.noreply.github.com> Date: Fri, 19 Dec 2025 17:03:05 +0100 Subject: [PATCH 3/3] Update make files. --- Makefile | 4 ++++ apps/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 94a9bbcf9..4b37317c2 100644 --- a/Makefile +++ b/Makefile @@ -123,6 +123,10 @@ qsim-cuda: qsim-custatevec: | check-cuquantum-root-set $(MAKE) -C apps/ qsim-custatevec +.PHONY: qsim-custatevecex +qsim-custatevecex: | check-cuquantum-root-set + $(MAKE) -C apps/ qsim-custatevecex + .PHONY: qsim-hip qsim-hip: $(MAKE) -C apps/ qsim-hip diff --git a/apps/Makefile b/apps/Makefile index 58ab53054..19ccbc422 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -8,7 +8,7 @@ CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu") CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x) CUSTATEVECEX_TARGETS = $(shell find . -maxdepth 1 -name "*custatevecex.cu") -CUSTATEVECEX_TARGETS := $(CUSTATEVEC_TARGETS:%custatevecex.cu=%custatevecex.x) +CUSTATEVECEX_TARGETS := $(CUSTATEVECEX_TARGETS:%custatevecex.cu=%custatevecex.x) HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu') HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)