diff --git a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
index 1c2f066e..b8a89b58 100644
--- a/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/aloha_hdf5_format.py
@@ -43,8 +43,7 @@ def get_cameras(hdf5_data):
 
 
 def check_format(raw_dir) -> bool:
-    # only frames from simulation are uncompressed
-    compressed_images = "sim" not in raw_dir.name
+    compressed_images = None
 
     hdf5_paths = list(raw_dir.glob("episode_*.hdf5"))
     assert len(hdf5_paths) != 0
@@ -62,18 +61,20 @@ def check_format(raw_dir) -> bool:
             for camera in get_cameras(data):
                 assert num_frames == data[f"/observations/images/{camera}"].shape[0]
 
-                if compressed_images:
-                    assert data[f"/observations/images/{camera}"].ndim == 2
+                assert data[f"/observations/images/{camera}"].ndim in [2, 4]
+                if data[f"/observations/images/{camera}"].ndim == 2:
+                    assert compressed_images is None or compressed_images
+                    compressed_images = True
                 else:
+                    assert compressed_images is None or not compressed_images
+                    compressed_images = False
                     assert data[f"/observations/images/{camera}"].ndim == 4
                     b, h, w, c = data[f"/observations/images/{camera}"].shape
                     assert c < h and c < w, f"Expect (h,w,c) image format but ({h=},{w=},{c=}) provided."
+    return compressed_images
 
 
-def load_from_raw(raw_dir, out_dir, fps, video, debug):
-    # only frames from simulation are uncompressed
-    compressed_images = "sim" not in raw_dir.name
-
+def load_from_raw(raw_dir, out_dir, fps, video, debug, compressed_images):
     hdf5_files = list(raw_dir.glob("*.hdf5"))
     ep_dicts = []
     episode_data_index = {"from": [], "to": []}
@@ -199,12 +200,12 @@ def to_hf_dataset(data_dict, video) -> Dataset:
 
 def from_raw_to_lerobot_format(raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False):
     # sanity check
-    check_format(raw_dir)
+    compressed_images = check_format(raw_dir)
 
     if fps is None:
         fps = 50
 
-    data_dir, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    data_dir, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug, compressed_images)
     hf_dataset = to_hf_dataset(data_dir, video)
 
     info = {
diff --git a/lerobot/configs/env/aloha_thom.yaml b/lerobot/configs/env/aloha_thom.yaml
new file mode 100644
index 00000000..a6570da7
--- /dev/null
+++ b/lerobot/configs/env/aloha_thom.yaml
@@ -0,0 +1,14 @@
+# @package _global_
+
+fps: 50
+
+env:
+  name: aloha
+  task: AlohaInsertion-v0
+  from_pixels: True
+  pixels_only: False
+  image_size: [3, 480, 640]
+  episode_length: 500
+  fps: ${fps}
+  state_dim: 6
+  action_dim: 6
diff --git a/lerobot/configs/policy/act_thom.yaml b/lerobot/configs/policy/act_thom.yaml
new file mode 100644
index 00000000..7574cce0
--- /dev/null
+++ b/lerobot/configs/policy/act_thom.yaml
@@ -0,0 +1,77 @@
+# @package _global_
+
+seed: 1000
+dataset_repo_id: lerobot/aloha_sim_insertion_human
+
+training:
+  offline_steps: 20000
+  online_steps: 0
+  eval_freq: 100000
+  save_freq: 200
+  log_freq: 200
+  save_model: true
+
+  batch_size: 8
+  lr: 1e-5
+  lr_backbone: 1e-5
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  online_steps_between_rollouts: 1
+
+  delta_timestamps:
+    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
+
+eval:
+  n_episodes: 50
+  batch_size: 50
+
+# See `configuration_act.py` for more details.
+policy:
+  name: act
+
+  # Input / output structure.
+  n_obs_steps: 1
+  chunk_size: 100 # chunk_size
+  n_action_steps: 100
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images: [3, 480, 640]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.front: mean_std
+    observation.state: mean_std
+  output_normalization_modes:
+    action: mean_std
+
+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  dim_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
+  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
+  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
+  n_decoder_layers: 1
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4
+
+  # Inference.
+  temporal_ensemble_momentum: null
+
+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0
diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py
index 2b28943d..0e67d8a6 100644
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -23,6 +23,7 @@ import hydra
 import torch
 from omegaconf import DictConfig
 from torch.cuda.amp import GradScaler
+from tqdm import tqdm
 
 from lerobot.common.datasets.factory import make_dataset
 from lerobot.common.datasets.utils import cycle
@@ -319,8 +320,8 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
 
     policy.train()
     is_offline = True
-    for step in range(cfg.training.offline_steps):
-        if step == 0:
+    for offline_step in tqdm(range(cfg.training.offline_steps)):
+        if offline_step == 0:
             logging.info("Start offline training on a fixed dataset")
         batch = next(dl_iter)
 
@@ -338,12 +339,12 @@ def train(cfg: DictConfig, out_dir: str | None = None, job_name: str | None = No
         )
 
         # TODO(rcadene): is it ok if step_t=0 = 0 and not 1 as previously done?
-        if step % cfg.training.log_freq == 0:
-            log_train_info(logger, train_info, step, cfg, offline_dataset, is_offline)
+        if offline_step % cfg.training.log_freq == 0:
+            log_train_info(logger, train_info, offline_step, cfg, offline_dataset, is_offline)
 
         # Note: evaluate_and_checkpoint_if_needed happens **after** the `step`th training update has completed,
         # so we pass in step + 1.
-        evaluate_and_checkpoint_if_needed(step + 1)
+        evaluate_and_checkpoint_if_needed(offline_step + 1)
 
     # create an empty online dataset similar to offline dataset
     online_dataset = deepcopy(offline_dataset)