Fix save checkpoint logic for TPUs
kaushikb11 committed Apr 6, 2021
1 parent beda8e8 commit caa2275
Showing 2 changed files with 8 additions and 4 deletions.
8 changes: 6 additions & 2 deletions pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -87,6 +87,8 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
         trainer.accelerator.setup_optimizers(trainer)
         trainer.precision_plugin.connect(self._model, None, None)
 
+        # replace trainer save_checkpoint to use `xm.save`
+        trainer.save_checkpoint = self.save_checkpoint
         self.barrier("pre-run-stage")
 
         results = trainer.train_or_test_or_predict()
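
The override above is the core of the fix: inside the spawned TPU worker, trainer.save_checkpoint is rebound to the plugin's own method so checkpoints are written with xm.save rather than plain torch.save. A minimal sketch of why that matters (not the plugin's exact code; save_checkpoint_on_tpu, checkpoint, and filepath are illustrative names):

    import torch_xla.core.xla_model as xm

    def save_checkpoint_on_tpu(checkpoint: dict, filepath: str) -> None:
        # xm.save moves XLA tensors to CPU before serializing and, by
        # default, writes only from the master ordinal, so the per-core
        # spawned processes do not race to write the same file.
        xm.save(checkpoint, filepath)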
@@ -201,12 +203,14 @@ def test_step(self, *args, **kwargs):
     def predict(self, *args, **kwargs):
         return self.lightning_module.predict(*args, **kwargs)
 
-    def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: str) -> None:
+    def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
         Args:
-            checkpoint: dict containing model and trainer state
             filepath: write-target file's path
+            weights_only: saving model weights only
         """
+        # dump states as a checkpoint dictionary object
+        checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
         # Todo: TypeError: 'mappingproxy' object does not support item assignment
         if _OMEGACONF_AVAILABLE:
             checkpoint = apply_to_collection(checkpoint, (DictConfig, ListConfig), OmegaConf.to_container)
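
With the signature change above, the plugin's save_checkpoint matches the Trainer.save_checkpoint(filepath, weights_only) interface, and it builds the checkpoint dict itself via the checkpoint connector instead of receiving one. A hypothetical usage sketch (MyLightningModule is a placeholder for any LightningModule): during fit, the ModelCheckpoint callback's calls to trainer.save_checkpoint inside the spawned TPU processes are now served by this plugin method.

    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import ModelCheckpoint

    model = MyLightningModule()  # placeholder: any LightningModule
    ckpt_cb = ModelCheckpoint(dirpath="checkpoints/", monitor="val_loss", save_top_k=1)
    trainer = pl.Trainer(tpu_cores=8, max_epochs=1, callbacks=[ckpt_cb])
    # Checkpointing during fit now goes through TPUSpawnPlugin.save_checkpoint,
    # i.e. checkpoint_connector.dump_checkpoint(...) followed by xm.save(...).
    trainer.fit(model)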
4 changes: 2 additions & 2 deletions tests/models/test_tpu.py
@@ -210,8 +210,8 @@ def test_tpu_grad_norm(tmpdir):
         progress_bar_refresh_rate=0,
         max_epochs=4,
         tpu_cores=1,
-        limit_train_batches=0.7,
-        limit_val_batches=0.7,
+        limit_train_batches=10,
+        limit_val_batches=10,
         gradient_clip_val=0.5,
     )

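The test now caps each epoch at a fixed batch count instead of a fraction, presumably to keep the TPU test fast and deterministic. For reference, the Trainer interprets the two types differently:

    from pytorch_lightning import Trainer

    Trainer(limit_train_batches=0.7)  # float: run 70% of the available train batches
    Trainer(limit_train_batches=10)   # int: run exactly 10 train batches per epoch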
