The first challenge is that autoregressive models require tokens to be arranged in a sequential order, and 3D shapes have no such natural ordering.

OctGPT(
  (split_emb): Embedding(2, 1152)
  (class_emb): Embedding(1, 1152)
  (vq_proj): Linear(in_features=32, out_features=1152, bias=True)
  (encoder): OctFormer(
    (layers): OctFormerStage(
      (blocks): ModuleList(
        (0): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (1): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (2): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (3): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (4): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (5): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (6): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (7): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (8): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (9): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (10): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (11): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
      )
    )
  )
  (encoder_ln): RMSNorm()
  (decoder): OctFormer(
    (layers): OctFormerStage(
      (blocks): ModuleList(
        (0): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (1): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (2): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (3): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (4): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (5): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (6): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (7): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (8): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (9): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (10): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=1
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
        (11): OctFormerBlock(
          (norm1): RMSNorm()
          (attention): OctreeAttention(
            dim=1152, patch_size=1024, num_heads=8, dilation=8
            (qkv): Linear(in_features=1152, out_features=3456, bias=True)
            (proj): Linear(in_features=1152, out_features=1152, bias=True)
            (proj_drop): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (rope): RotaryPosEmb()
          )
          (norm2): RMSNorm()
          (mlp): MLP(
            (fc1): Linear(in_features=1152, out_features=4608, bias=True)
            (act): GELU(approximate='none')
            (fc2): Linear(in_features=4608, out_features=1152, bias=True)
            (drop): Dropout(p=0.1, inplace=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (pos_emb): AbsPosEmb(
            (depth_emb): Embedding(4, 1152)
          )
        )
      )
    )
  )
  (decoder_ln): RMSNorm()
  (split_head): Linear(in_features=1152, out_features=2, bias=True)
  (vq_head): Linear(in_features=1152, out_features=64, bias=True)
)