-
Notifications
You must be signed in to change notification settings - Fork 218
Use usize for indices and dimensions
#343
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
55cd2bc
9a23b4d
deaa348
191ca97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,69 +107,59 @@ macro_rules! launch { | |
| #[derive(Debug, Clone, PartialEq, Eq)] | ||
| pub struct GridSize { | ||
| /// Width of grid in blocks | ||
| pub x: u32, | ||
| pub x: usize, | ||
| /// Height of grid in blocks | ||
| pub y: u32, | ||
| pub y: usize, | ||
| /// Depth of grid in blocks | ||
| pub z: u32, | ||
| pub z: usize, | ||
| } | ||
| impl GridSize { | ||
| /// Create a one-dimensional grid of `x` blocks | ||
| #[inline] | ||
| pub fn x(x: u32) -> GridSize { | ||
| pub fn x(x: usize) -> GridSize { | ||
| GridSize { x, y: 1, z: 1 } | ||
| } | ||
|
|
||
| /// Create a two-dimensional grid of `x * y` blocks | ||
| #[inline] | ||
| pub fn xy(x: u32, y: u32) -> GridSize { | ||
| pub fn xy(x: usize, y: usize) -> GridSize { | ||
| GridSize { x, y, z: 1 } | ||
| } | ||
|
|
||
| /// Create a three-dimensional grid of `x * y * z` blocks | ||
| #[inline] | ||
| pub fn xyz(x: u32, y: u32, z: u32) -> GridSize { | ||
| pub fn xyz(x: usize, y: usize, z: usize) -> GridSize { | ||
| GridSize { x, y, z } | ||
| } | ||
| } | ||
| impl From<u32> for GridSize { | ||
| fn from(x: u32) -> GridSize { | ||
| impl From<usize> for GridSize { | ||
| fn from(x: usize) -> GridSize { | ||
| GridSize::x(x) | ||
| } | ||
| } | ||
| impl From<(u32, u32)> for GridSize { | ||
| fn from((x, y): (u32, u32)) -> GridSize { | ||
| impl From<(usize, usize)> for GridSize { | ||
| fn from((x, y): (usize, usize)) -> GridSize { | ||
| GridSize::xy(x, y) | ||
| } | ||
| } | ||
| impl From<(u32, u32, u32)> for GridSize { | ||
| fn from((x, y, z): (u32, u32, u32)) -> GridSize { | ||
| impl From<(usize, usize, usize)> for GridSize { | ||
| fn from((x, y, z): (usize, usize, usize)) -> GridSize { | ||
| GridSize::xyz(x, y, z) | ||
| } | ||
| } | ||
| impl<'a> From<&'a GridSize> for GridSize { | ||
| impl From<&GridSize> for GridSize { | ||
| fn from(other: &GridSize) -> GridSize { | ||
| other.clone() | ||
| } | ||
| } | ||
| impl From<glam::UVec2> for GridSize { | ||
| fn from(vec: glam::UVec2) -> Self { | ||
| GridSize::xy(vec.x, vec.y) | ||
| } | ||
| } | ||
| impl From<glam::UVec3> for GridSize { | ||
| fn from(vec: glam::UVec3) -> Self { | ||
| GridSize::xyz(vec.x, vec.y, vec.z) | ||
| } | ||
| } | ||
| impl From<glam::USizeVec2> for GridSize { | ||
| fn from(vec: glam::USizeVec2) -> Self { | ||
| GridSize::xy(vec.x as u32, vec.y as u32) | ||
| GridSize::xy(vec.x, vec.y) | ||
| } | ||
| } | ||
| impl From<glam::USizeVec3> for GridSize { | ||
| fn from(vec: glam::USizeVec3) -> Self { | ||
| GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) | ||
| GridSize::xyz(vec.x, vec.y, vec.z) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -183,68 +173,58 @@ impl From<glam::USizeVec3> for GridSize { | |
| #[derive(Debug, Clone, PartialEq, Eq)] | ||
| pub struct BlockSize { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think these are defined by NVIDIA to be 32 bits?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, they are. But these are Rust-side equivalents of NVIDIA types. This isn't an FFI binding type or anything, so it doesn't need to be identical to the NVIDIA type. Lengths and indices are `usize` in Rust. |
||
| /// X dimension of each thread block | ||
| pub x: u32, | ||
| pub x: usize, | ||
| /// Y dimension of each thread block | ||
| pub y: u32, | ||
| pub y: usize, | ||
| /// Z dimension of each thread block | ||
| pub z: u32, | ||
| pub z: usize, | ||
| } | ||
| impl BlockSize { | ||
| /// Create a one-dimensional block of `x` threads | ||
| #[inline] | ||
| pub fn x(x: u32) -> BlockSize { | ||
| pub fn x(x: usize) -> BlockSize { | ||
| BlockSize { x, y: 1, z: 1 } | ||
| } | ||
|
|
||
| /// Create a two-dimensional block of `x * y` threads | ||
| #[inline] | ||
| pub fn xy(x: u32, y: u32) -> BlockSize { | ||
| pub fn xy(x: usize, y: usize) -> BlockSize { | ||
| BlockSize { x, y, z: 1 } | ||
| } | ||
|
|
||
| /// Create a three-dimensional block of `x * y * z` threads | ||
| #[inline] | ||
| pub fn xyz(x: u32, y: u32, z: u32) -> BlockSize { | ||
| pub fn xyz(x: usize, y: usize, z: usize) -> BlockSize { | ||
| BlockSize { x, y, z } | ||
| } | ||
| } | ||
| impl From<u32> for BlockSize { | ||
| fn from(x: u32) -> BlockSize { | ||
| impl From<usize> for BlockSize { | ||
| fn from(x: usize) -> BlockSize { | ||
| BlockSize::x(x) | ||
| } | ||
| } | ||
| impl From<(u32, u32)> for BlockSize { | ||
| fn from((x, y): (u32, u32)) -> BlockSize { | ||
| impl From<(usize, usize)> for BlockSize { | ||
| fn from((x, y): (usize, usize)) -> BlockSize { | ||
| BlockSize::xy(x, y) | ||
| } | ||
| } | ||
| impl From<(u32, u32, u32)> for BlockSize { | ||
| fn from((x, y, z): (u32, u32, u32)) -> BlockSize { | ||
| impl From<(usize, usize, usize)> for BlockSize { | ||
| fn from((x, y, z): (usize, usize, usize)) -> BlockSize { | ||
| BlockSize::xyz(x, y, z) | ||
| } | ||
| } | ||
| impl<'a> From<&'a BlockSize> for BlockSize { | ||
| impl From<&BlockSize> for BlockSize { | ||
| fn from(other: &BlockSize) -> BlockSize { | ||
| other.clone() | ||
| } | ||
| } | ||
| impl From<glam::UVec2> for BlockSize { | ||
| fn from(vec: glam::UVec2) -> Self { | ||
| BlockSize::xy(vec.x, vec.y) | ||
| } | ||
| } | ||
| impl From<glam::UVec3> for BlockSize { | ||
| fn from(vec: glam::UVec3) -> Self { | ||
| BlockSize::xyz(vec.x, vec.y, vec.z) | ||
| } | ||
| } | ||
| impl From<glam::USizeVec2> for BlockSize { | ||
| fn from(vec: glam::USizeVec2) -> Self { | ||
| BlockSize::xy(vec.x as u32, vec.y as u32) | ||
| BlockSize::xy(vec.x, vec.y) | ||
| } | ||
| } | ||
| impl From<glam::USizeVec3> for BlockSize { | ||
| fn from(vec: glam::USizeVec3) -> Self { | ||
| BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32) | ||
| BlockSize::xyz(vec.x, vec.y, vec.z) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,7 +66,7 @@ | |
| //! vary by device. Query device properties when you need exact limits. | ||
| //! | ||
| use cuda_std_macros::gpu_only; | ||
| use glam::{UVec2, UVec3}; | ||
| use glam::{USizeVec2, USizeVec3}; | ||
|
|
||
| // different calling conventions don't exist in nvptx, so we just use C as a placeholder. | ||
| unsafe extern "C" { | ||
|
|
@@ -99,116 +99,116 @@ macro_rules! in_range { | |
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn thread_idx_x() -> u32 { | ||
| // The range is derived from the `block_idx_x` range. | ||
| in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) | ||
| pub fn thread_idx_x() -> usize { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, I think this is defined by NVIDIA and it isn't tied to pointer size. The upstream rustc stuff uses
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think rustc uses I can't find where the type is defined in CUDA but to some degree it doesn't matter because C/C++ is so permissive about allowing any integer type to be used as an index. I've seen code examples in the CUDA docs where But because Rust only allows |
||
| // The range is derived from the `block_dim_x` range. | ||
| in_range!(core::arch::nvptx::_thread_idx_x, 0..1024) as usize | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unrelated to current change, but I find the range here suspicious:
The "current GPUs" suggests this is not an API promise, but merely the current highest value of this. Can this rise in the future? What happens then?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See table 30 in this section for a more specific description of the limits here. You are right that it's not a guarantee, but the relevant numbers haven't changed from Compute Capability 5.0 all the way to 12.x.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at this I realize that some of these comments have an error in them. I'll fix that. |
||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn thread_idx_y() -> u32 { | ||
| // The range is derived from the `block_idx_y` range. | ||
| in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) | ||
| pub fn thread_idx_y() -> usize { | ||
| // The range is derived from the `block_dim_y` range. | ||
| in_range!(core::arch::nvptx::_thread_idx_y, 0..1024) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn thread_idx_z() -> u32 { | ||
| // The range is derived from the `block_idx_z` range. | ||
| in_range!(core::arch::nvptx::_thread_idx_z, 0..64) | ||
| pub fn thread_idx_z() -> usize { | ||
| // The range is derived from the `block_dim_z` range. | ||
| in_range!(core::arch::nvptx::_thread_idx_z, 0..64) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_idx_x() -> u32 { | ||
| // The range is derived from the `grid_idx_x` range. | ||
| in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) | ||
| pub fn block_idx_x() -> usize { | ||
| // The range is derived from the `grid_dim_x` range. | ||
| in_range!(core::arch::nvptx::_block_idx_x, 0..2147483647) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_idx_y() -> u32 { | ||
| // The range is derived from the `grid_idx_y` range. | ||
| in_range!(core::arch::nvptx::_block_idx_y, 0..65535) | ||
| pub fn block_idx_y() -> usize { | ||
| // The range is derived from the `grid_dim_y` range. | ||
| in_range!(core::arch::nvptx::_block_idx_y, 0..65535) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_idx_z() -> u32 { | ||
| // The range is derived from the `grid_idx_z` range. | ||
| in_range!(core::arch::nvptx::_block_idx_z, 0..65535) | ||
| pub fn block_idx_z() -> usize { | ||
| // The range is derived from the `grid_dim_z` range. | ||
| in_range!(core::arch::nvptx::_block_idx_z, 0..65535) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_dim_x() -> u32 { | ||
| pub fn block_dim_x() -> usize { | ||
| // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024. | ||
| in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) | ||
| in_range!(core::arch::nvptx::_block_dim_x, 1..=1024) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_dim_y() -> u32 { | ||
| pub fn block_dim_y() -> usize { | ||
| // CUDA Compute Capabilities: "Maximum x- or y-dimensionality of a block" is 1024. | ||
| in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) | ||
| in_range!(core::arch::nvptx::_block_dim_y, 1..=1024) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_dim_z() -> u32 { | ||
| pub fn block_dim_z() -> usize { | ||
| // CUDA Compute Capabilities: "Maximum z-dimension of a block" is 64. | ||
| in_range!(core::arch::nvptx::_block_dim_z, 1..=64) | ||
| in_range!(core::arch::nvptx::_block_dim_z, 1..=64) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn grid_dim_x() -> u32 { | ||
| pub fn grid_dim_x() -> usize { | ||
| // CUDA Compute Capabilities: "Maximum x-dimension of a grid of thread blocks" is 2^31 - 1. | ||
| in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) | ||
| in_range!(core::arch::nvptx::_grid_dim_x, 1..=2147483647) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn grid_dim_y() -> u32 { | ||
| pub fn grid_dim_y() -> usize { | ||
| // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535. | ||
| in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) | ||
| in_range!(core::arch::nvptx::_grid_dim_y, 1..=65535) as usize | ||
| } | ||
|
|
||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn grid_dim_z() -> u32 { | ||
| pub fn grid_dim_z() -> usize { | ||
| // CUDA Compute Capabilities: "Maximum y- or z-dimension of a grid of thread blocks" is 65535. | ||
| in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) | ||
| in_range!(core::arch::nvptx::_grid_dim_z, 1..=65535) as usize | ||
| } | ||
|
|
||
| /// Gets the 3d index of the thread currently executing the kernel. | ||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn thread_idx() -> UVec3 { | ||
| UVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z()) | ||
| pub fn thread_idx() -> USizeVec3 { | ||
| USizeVec3::new(thread_idx_x(), thread_idx_y(), thread_idx_z()) | ||
| } | ||
|
|
||
| /// Gets the 3d index of the block that the thread currently executing the kernel is located in. | ||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_idx() -> UVec3 { | ||
| UVec3::new(block_idx_x(), block_idx_y(), block_idx_z()) | ||
| pub fn block_idx() -> USizeVec3 { | ||
| USizeVec3::new(block_idx_x(), block_idx_y(), block_idx_z()) | ||
| } | ||
|
|
||
| /// Gets the 3d layout of the thread blocks executing this kernel. In other words, | ||
| /// how many threads exist in each thread block in every direction. | ||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn block_dim() -> UVec3 { | ||
| UVec3::new(block_dim_x(), block_dim_y(), block_dim_z()) | ||
| pub fn block_dim() -> USizeVec3 { | ||
| USizeVec3::new(block_dim_x(), block_dim_y(), block_dim_z()) | ||
| } | ||
|
|
||
| /// Gets the 3d layout of the block grids executing this kernel. In other words, | ||
| /// how many thread blocks exist in each grid in every direction. | ||
| #[gpu_only] | ||
| #[inline(always)] | ||
| pub fn grid_dim() -> UVec3 { | ||
| UVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z()) | ||
| pub fn grid_dim() -> USizeVec3 { | ||
| USizeVec3::new(grid_dim_x(), grid_dim_y(), grid_dim_z()) | ||
| } | ||
|
|
||
| /// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This | ||
|
|
@@ -220,7 +220,7 @@ pub fn grid_dim() -> UVec3 { | |
| #[gpu_only] | ||
| #[rustfmt::skip] | ||
| #[inline(always)] | ||
| pub fn index() -> u32 { | ||
| pub fn index() -> usize { | ||
| let grid_dim = grid_dim(); | ||
| let block_idx = block_idx(); | ||
| let block_dim = block_dim(); | ||
|
|
@@ -235,31 +235,31 @@ pub fn index() -> u32 { | |
| } | ||
|
|
||
| #[inline(always)] | ||
| pub fn index_1d() -> u32 { | ||
| thread_idx_x() as u32 + block_idx_x() as u32 * block_dim_x() as u32 | ||
| pub fn index_1d() -> usize { | ||
| thread_idx_x() + block_idx_x() * block_dim_x() | ||
| } | ||
|
|
||
| #[inline(always)] | ||
| pub fn index_2d() -> UVec2 { | ||
| pub fn index_2d() -> USizeVec2 { | ||
| let i = thread_idx_x() + block_idx_x() * block_dim_x(); | ||
| let j = thread_idx_y() + block_idx_y() * block_dim_y(); | ||
| UVec2::new(i, j) | ||
| USizeVec2::new(i, j) | ||
| } | ||
|
|
||
| #[inline(always)] | ||
| pub fn index_3d() -> UVec3 { | ||
| pub fn index_3d() -> USizeVec3 { | ||
| let i = thread_idx_x() + block_idx_x() * block_dim_x(); | ||
| let j = thread_idx_y() + block_idx_y() * block_dim_y(); | ||
| let k = thread_idx_z() + block_idx_z() * block_dim_z(); | ||
| UVec3::new(i, j, k) | ||
| USizeVec3::new(i, j, k) | ||
| } | ||
|
|
||
| /// Whether this is the first thread (not the first thread to be executing). This function is guaranteed | ||
| /// to only return true in a single thread that is invoking it. This is useful for only doing something | ||
| /// once. | ||
| #[inline(always)] | ||
| pub fn first() -> bool { | ||
| block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO | ||
| block_idx() == USizeVec3::ZERO && thread_idx() == USizeVec3::ZERO | ||
| } | ||
|
|
||
| /// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe consider keeping the old impl, if that does not cause any issues (to allow casting from
`u32` too). IDK if this would be of any worth.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about it but none of the existing example code requires it so it doesn't seem useful. We can add it back if necessary.