Labels: bug (Something isn't working)
Description
=================================== FAILURES ===================================
___________________ TestSeed.test_multi_worker_deterministic ___________________
self = <test_seed.TestSeed object at 0x7f1b971e14b0>
distributed_environment = <function DistributedEnvironment.bind.<locals>.init at 0x7f1b96fd0160>
    def test_multi_worker_deterministic(self, distributed_environment):
        states = distributed_environment(4).start(seed, 42)
        assert [s['seed'] for s in states] == [42, 42, 42, 42]
        # workers should have different states
        assert all((s['torch_state'] != states[0]['torch_state']).any() for s in states[1:])
        assert all((s['numpy_state'] != states[0]['numpy_state']).any() for s in states[1:])
        assert all((s['random_state'] != states[0]['random_state']).any() for s in states[1:])
        # same seed should yield same states
        new_states = distributed_environment(4).start(seed, 42)
        assert [s['seed'] for s in new_states] == [42, 42, 42, 42]
        assert all((s1['torch_state'] == s2['torch_state']).all() for s1, s2 in zip(states, new_states))
        assert all((s1['numpy_state'] == s2['numpy_state']).all() for s1, s2 in zip(states, new_states))
        assert all((s1['random_state'] == s2['random_state']).all() for s1, s2 in zip(states, new_states))
        # different seed should yield different states
>       new_states = distributed_environment(4).start(seed, 11)
test/test_seed.py:84:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/conftest.py:63: in start
ret = conn.recv()
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:250: in recv
buf = self._recv_bytes()
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:414: in _recv_bytes
buf = self._recv(4)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <multiprocessing.connection.Connection object at 0x7f1b971db640>
size = 4, read = <built-in function read>
    def _recv(self, size, read=_read):
        buf = io.BytesIO()
        handle = self._handle
        remaining = size
        while remaining > 0:
            chunk = read(handle, remaining)
            n = len(chunk)
            if n == 0:
                if remaining == size:
>                   raise EOFError
E                   EOFError
/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/connection.py:383: EOFError
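The EOFError above is only the parent-side symptom: start() in test/conftest.py blocks in conn.recv() waiting for the worker's state dict, and when the spawned worker dies during process-group setup it exits without ever sending, so the parent's end of the pipe sees EOF. A minimal sketch of that mechanism (hypothetical illustration, not the project's conftest.py):

    # A worker that dies before sending anything leaves the parent's
    # conn.recv() with EOFError, just like in the traceback above.
    import multiprocessing as mp

    def crashing_worker(conn):
        # Mirrors the real failure: the worker raises during setup and
        # never sends anything back through the pipe (conn stays unused).
        raise RuntimeError("simulated Gloo connectFullMesh failure")

    if __name__ == "__main__":
        ctx = mp.get_context("spawn")
        parent_conn, child_conn = ctx.Pipe()
        p = ctx.Process(target=crashing_worker, args=(child_conn,))
        p.start()
        child_conn.close()  # parent must drop its copy of the child end
        p.join()
        try:
            parent_conn.recv()
        except EOFError:
            print("worker exited without sending its state")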
----------------------------- Captured stderr call -----------------------------
[E108 14:52:04.084461486 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
Process SpawnProcess-33:
Traceback (most recent call last):
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 29, in _run
torch.distributed.barrier()
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
return func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4164, in barrier
work.wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:525] Read error [10.1.0.114]:61849: Connection reset by peer
Process SpawnProcess-32:
Traceback (most recent call last):
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
return func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
func_return = func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
default_pg, _ = _new_process_group_helper(
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
[E108 17:22:04.053240347 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
Process SpawnProcess-34:
Traceback (most recent call last):
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
return func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
func_return = func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
default_pg, _ = _new_process_group_helper(
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
[E108 17:22:04.118509503 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
Process SpawnProcess-31:
Traceback (most recent call last):
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/runner/work/dmlcloud/dmlcloud/test/conftest.py", line 27, in _run
torch.distributed.init_process_group(backend='gloo', world_size=world_size, rank=rank, store=store)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
return func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
func_return = func(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
default_pg, _ = _new_process_group_helper(
File "/opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.h:301] Connect timeout [none]
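All three worker tracebacks point to the same root cause: the Gloo TCP rendezvous (connectFullMesh) fails or times out on the CI runner, so the workers never reach torch.distributed.barrier() and the test harness collapses. That looks like runner/network flakiness rather than a seeding bug. One possible mitigation is sketched below; it assumes a single-node runner where loopback is reachable, and it uses a TCPStore plus explicit timeout purely for illustration (whatever store test/conftest.py actually builds may differ):

    # Hypothetical mitigation sketch -- NOT the project's test/conftest.py.
    import datetime
    import multiprocessing as mp
    import os

    import torch.distributed as dist

    def worker(rank: int, world_size: int, port: int) -> None:
        # Pin Gloo to the loopback interface so connectFullMesh cannot pick
        # a transient or unreachable address on the runner.
        os.environ["GLOO_SOCKET_IFNAME"] = "lo"
        store = dist.TCPStore("127.0.0.1", port, world_size, rank == 0)
        dist.init_process_group(
            backend="gloo",
            world_size=world_size,
            rank=rank,
            store=store,
            # the default timeout can be too tight on a busy shared runner
            timeout=datetime.timedelta(minutes=5),
        )
        dist.barrier()
        dist.destroy_process_group()

    if __name__ == "__main__":
        world_size, port = 4, 29501
        ctx = mp.get_context("spawn")
        procs = [ctx.Process(target=worker, args=(r, world_size, port)) for r in range(world_size)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()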
=============================== warnings summary ===============================
test/test_seed.py::TestSeed::test_single_worker_deterministic
/home/runner/work/dmlcloud/dmlcloud/test/test_seed.py:23: DeprecationWarning: __array__ implementation doesn't accept a copy keyword, so passing copy=False failed. __array__ must implement 'dtype' and 'copy' keyword arguments. To learn more, see the migration guide https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword
prev_torch_state = np.array(torch.get_rng_state())
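The DeprecationWarning is unrelated to the failure but easy to address: under NumPy 2.x, np.array(...) passes a copy= keyword to Tensor.__array__, which older torch builds don't accept. One way to snapshot the CPU RNG state without going through __array__ (a suggestion, not necessarily how test_seed.py should be changed):

    import torch

    # torch.get_rng_state() returns a CPU uint8 tensor; .numpy() converts it
    # directly, and .copy() detaches the snapshot from the live RNG buffer.
    prev_torch_state = torch.get_rng_state().numpy().copy()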