I’m running a Jupyter notebook on the INRC cloud, which I’m launching with SLURM (following the example on the INRC website). The remainder of my code is just imports and loading the model from TF. The output and error response are as follows:
INFO:DRV: SLURM is being run in background
INFO:DRV: Connecting to 10.212.98.108:33191
INFO:DRV: Host server up..............Done 0.19s
INFO:DRV: Encoding axons/synapses.....Done 0.24s
INFO:DRV: Compiling Embedded snips....Done 0.36s
INFO:DRV: SLURM is being run in background
INFO:DRV: Connecting to 10.212.98.108:36617
INFO:DRV: Host server up..............Done 0.15s
INFO:DRV: Encoding axons/synapses.....Done 0.25s
INFO:DRV: Compiling Embedded snips....Done 0.30s
INFO:DRV: Encoding probes.............Done 1.72ms
INFO:HST: Args chip=0 cpu=0 /homes/sumbad/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/compilers/../../../temp/1614132815.6668978/launcher_chip0_lmt0.bin --chips=1 --remote-relay=1
INFO:DRV: Booting up..................Done 2.64s
INFO:HST: Lakemont_driver...
INFO:DRV: Transferring spikes.........Done 1.39s
INFO:DRV: Transferring probes.........Done 6.10ms
INFO:DRV: Configuring registers.......Done 0.05s
INFO:HST: srun: Force Terminated job 1066710
INFO:HST: srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
INFO:HST: slurmstepd: error: *** STEP 1066710.0 ON ncl-ext-ghrd-04 CANCELLED AT 2021-02-24T02:33:35 DUE TO TIME LIMIT ***
INFO:DRV: Executing...................Error 1189.28s
INFO:DRV: Executor: 1000 timesteps........Error 1190.73s
INFO:HST: srun: error: ncl-ext-ghrd-04: task 0: Terminated
---------------------------------------------------------------------------
_InactiveRpcError Traceback (most recent call last)
<ipython-input-13-85dbb53b9b0a> in <module>
13 with sim:
---> 14 sim.run(run_time)
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/simulator.py in run(self, time_in_seconds)
329 )
--> 330 self.run_steps(steps)
331
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/simulator.py in run_steps(self, steps)
342
--> 343 self._runner.run_steps(steps)
344 self._n_steps += steps
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/simulator.py in loihi_precomputed_host_pre_only(self, steps)
505 self._host2chip(self.loihi)
--> 506 self.loihi.run_steps(steps, blocking=True)
507 self.timers.stop("run")
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/hardware/interface.py in run_steps(self, steps, blocking)
252 # start the board running the desired number of steps
--> 253 d_get(self.nxsdk_board, b"cnVu")(steps, **{d(b"YVN5bmM="): not blocking})
254
~/nengo_venv/lib/python3.5/site-packages/nxsdk/graph/nxboard.py in run(self, numSteps, aSync, maxTimeInterval, generateCfg, cfgPath, partition)
261 aSync=aSync,
--> 262 traceDirectory=traceDirectory)
263 else:
~/nengo_venv/lib/python3.5/site-packages/nxsdk/graph/nxboard.py in _run(self, numSteps, aSync, traceDirectory)
232 self, traceDirectory=traceDirectory)
--> 233 self.executor.start(numSteps, aSync)
234
~/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/executor.py in start(self, numSteps, aSync)
82 if not aSync:
---> 83 self.finish()
84
~/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/executor.py in finish(self)
119 if self._state is ExecutionState.RUNNING:
--> 120 self._wait()
121 self._notifyListeners(ExecutionEventEnum.POST_EXECUTION)
~/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/executor.py in _wait(self)
126 with timedContextLogging("Executing", NxSDKLogger.NXDRIVER):
--> 127 self._executor_service.waitExecution(empty)
128
~/nengo_venv/lib/python3.5/site-packages/grpc/_channel.py in __call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
922 wait_for_ready, compression)
--> 923 return _end_unary_response_blocking(state, call, False, None)
924
~/nengo_venv/lib/python3.5/site-packages/grpc/_channel.py in _end_unary_response_blocking(state, call, with_call, deadline)
825 else:
--> 826 raise _InactiveRpcError(state)
827
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.UNAVAILABLE
details = "Socket closed"
debug_error_string = "{"created":"@1614134015.573146850","description":"Error received from peer ipv4:10.212.98.108:36617","file":"src/core/lib/surface/call.cc","file_line":1067,"grpc_message":"Socket closed","grpc_status":14}"
>
During handling of the above exception, another exception occurred:
_InactiveRpcError Traceback (most recent call last)
<ipython-input-13-85dbb53b9b0a> in <module>
12
13 with sim:
---> 14 sim.run(run_time)
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/simulator.py in __exit__(self, exc_type, exc_value, traceback)
215 def __exit__(self, exc_type, exc_value, traceback):
216 for sim in self.sims.values():
--> 217 sim.__exit__(exc_type, exc_value, traceback)
218 self.close()
219
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/hardware/interface.py in __exit__(self, exc_type, exc_value, traceback)
127
128 def __exit__(self, exc_type, exc_value, traceback):
--> 129 self.close()
130
131 @classmethod
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/hardware/interface.py in close(self)
159
160 if self.nxsdk_board is not None:
--> 161 d_func(self.nxsdk_board, b"ZGlzY29ubmVjdA==")
162 self.nxsdk_board = None
163
~/nengo_venv/lib/python3.5/site-packages/nengo_loihi/nxsdk_obfuscation.py in d_func(obj, kwargs, *attrs)
75 kwargs = {deobfuscate(k): v for k, v in kwargs.items()}
76 func = d_get(obj, *attrs)
---> 77 return func(**kwargs)
~/nengo_venv/lib/python3.5/site-packages/nxsdk/graph/nxboard.py in disconnect(self)
319 """
320 BasicSpikeGenerator.isSpikeGenProcessConfigured = False
--> 321 self.executor.stop()
322 self._executor = None
323
~/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/executor.py in stop(self, force)
94 _force.force = force
95 self._executor_service.stopExecution(_force)
---> 96 self._notifyListeners(ExecutionEventEnum.ON_STOP)
97 self._host_coordinator.stop()
98 self._state = ExecutionState.UNDEFINED
~/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/executor.py in _notifyListeners(self, event)
147 listener.postExecution()
148 elif event == ExecutionEventEnum.ON_STOP:
--> 149 listener.onStop()
150 else:
151 raise Exception("Invalid event {}".format(event))
~/nengo_venv/lib/python3.5/site-packages/nxsdk/driver/listeners/lakemont_orchestrator.py in onStop(self)
39 def onStop(self) -> None:
40 """Stops the lakemont driver"""
---> 41 self.stopLmtDriver(empty)
~/nengo_venv/lib/python3.5/site-packages/grpc/_channel.py in __call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
921 state, call, = self._blocking(request, timeout, metadata, credentials,
922 wait_for_ready, compression)
--> 923 return _end_unary_response_blocking(state, call, False, None)
924
925 def with_call(self,
~/nengo_venv/lib/python3.5/site-packages/grpc/_channel.py in _end_unary_response_blocking(state, call, with_call, deadline)
824 return state.response
825 else:
--> 826 raise _InactiveRpcError(state)
827
828
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.UNAVAILABLE
details = "failed to connect to all addresses"
debug_error_string = "{"created":"@1614134015.577352099","description":"Failed to pick subchannel","file":"src/core/ext/filters/client_channel/client_channel.cc","file_line":5390,"referenced_errors":[{"created":"@1614134015.577347950","description":"failed to connect to all addresses","file":"src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":397,"grpc_status":14}]}"
>
I tried using a “nahuku32” partition just now and received the same timeout error. I’m pretty sure I’m connecting properly, because I was able to get output from a “loihi” partition without the energy probe. I’m willing to share more code, since the CNN in its current form is nothing groundbreaking. What would the recommended format be? Should I upload the full .ipynb, or save just the code itself to a text file?