Fix save checkpoint logic for TPUs
kaushikb11 committed Apr 6, 2021
1 parent beda8e8 commit caa2275
Showing 2 changed files with 8 additions and 4 deletions.
8 changes: 6 additions & 2 deletions pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -87,6 +87,8 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
         trainer.accelerator.setup_optimizers(trainer)
         trainer.precision_plugin.connect(self._model, None, None)
 
+        # replace trainer save_checkpoint to use `xm.save`
+        trainer.save_checkpoint = self.save_checkpoint
         self.barrier("pre-run-stage")
 
         results = trainer.train_or_test_or_predict()
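
The override above is the core of the fix: inside the spawned TPU worker, trainer.save_checkpoint is rebound to the plugin's own method so checkpoints are written with xm.save rather than plain torch.save. A minimal sketch of why that matters (not the plugin's exact code; save_checkpoint_on_tpu, checkpoint, and filepath are illustrative names):

    import torch_xla.core.xla_model as xm

    def save_checkpoint_on_tpu(checkpoint: dict, filepath: str) -> None:
        # xm.save moves XLA tensors to CPU before serializing and, by
        # default, writes only from the master ordinal, so the per-core
        # spawned processes do not race to write the same file.
        xm.save(checkpoint, filepath)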
@@ -201,12 +203,14 @@ def test_step(self, *args, **kwargs):
     def predict(self, *args, **kwargs):
         return self.lightning_module.predict(*args, **kwargs)
 
-    def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: str) -> None:
+    def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
         Args:
-            checkpoint: dict containing model and trainer state
             filepath: write-target file's path
+            weights_only: saving model weights only
         """
+        # dump states as a checkpoint dictionary object
+        checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
         # Todo: TypeError: 'mappingproxy' object does not support item assignment
         if _OMEGACONF_AVAILABLE:
             checkpoint = apply_to_collection(checkpoint, (DictConfig, ListConfig), OmegaConf.to_container)
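
With the signature change above, the plugin's save_checkpoint matches the Trainer.save_checkpoint(filepath, weights_only) interface, and it builds the checkpoint dict itself via the checkpoint connector instead of receiving one. A hypothetical usage sketch (MyLightningModule is a placeholder for any LightningModule): during fit, the ModelCheckpoint callback's calls to trainer.save_checkpoint inside the spawned TPU processes are now served by this plugin method.

    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import ModelCheckpoint

    model = MyLightningModule()  # placeholder: any LightningModule
    ckpt_cb = ModelCheckpoint(dirpath="checkpoints/", monitor="val_loss", save_top_k=1)
    trainer = pl.Trainer(tpu_cores=8, max_epochs=1, callbacks=[ckpt_cb])
    # Checkpointing during fit now goes through TPUSpawnPlugin.save_checkpoint,
    # i.e. checkpoint_connector.dump_checkpoint(...) followed by xm.save(...).
    trainer.fit(model)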
4 changes: 2 additions & 2 deletions tests/models/test_tpu.py
@@ -210,8 +210,8 @@ def test_tpu_grad_norm(tmpdir):
         progress_bar_refresh_rate=0,
         max_epochs=4,
         tpu_cores=1,
-        limit_train_batches=0.7,
-        limit_val_batches=0.7,
+        limit_train_batches=10,
+        limit_val_batches=10,
         gradient_clip_val=0.5,
     )

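The test now caps each epoch at a fixed batch count instead of a fraction, presumably to keep the TPU test fast and deterministic. For reference, the Trainer interprets the two types differently:

    from pytorch_lightning import Trainer

    Trainer(limit_train_batches=0.7)  # float: run 70% of the available train batches
    Trainer(limit_train_batches=10)   # int: run exactly 10 train batches per epoch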
