Schema to configure remote machines
In the schema below, the `machine` and `resources` dicts accept all keywords supported by the dpdispatcher package.
- Supports remote machines with various batch systems: Bash, Slurm, OpenPBS, SGE, TORQUE, LSF, OpenAPI, ...
- Supports various connection contexts: Local, SSH, HDFS, OpenAPI, ...
Schema:
### The `machine` and `resources` dicts accept all keywords supported by the dpdispatcher package
# machine dict: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/machine.html
# resources dict: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/resources.html
common_schema: &common ### Configure remote machine
type: dict
schema:
machine: ### ANCHOR: Configure parameters for logging in to the remote machine.
type: dict
required: True
allow_unknown: True
schema:
batch_type: # batch system type. Choices: 'Bash', 'Slurm', 'OpenPBS', 'SGE', 'TORQUE', 'LSF', 'OpenAPI'. See more: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/batch.html
type: string
context_type: # context type. Choices: 'Local', 'SSH', 'HDFS', 'OpenAPI'. See more: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/context.html
type: string
remote_root: # remote root directory
type: string
remote_profile: # login profile for the remote machine
type: dict
schema:
hostname: # hostname
type: string
username: # username
type: string
password: # password
type: string
port: # port
type: integer
timeout: # timeout of the SSH connection
type: integer
execute_command: # command to execute right after login
type: string
resources: ### ANCHOR: Configure resources on the remote machine.
type: dict
required: True
allow_unknown: True
schema:
number_node: # number of nodes.
type: integer
cpu_per_node: # number of CPUs per node.
type: integer
gpu_per_node: # number of GPUs per node.
type: integer
custom_flags: # list[str] of custom flags
type: list
module_list: # list[str] of modules
type: list
source_list: # list[str] of files to source
type: list
envs: # environment variables
type: dict
command: # command to execute on the remote machine.
type: string
job_limit: # maximum number of jobs in one submission to the cluster. Default is 5.
type: integer
work_load_ratio: # ratio of total jobs to run on this machine. If not set, jobs are distributed equally among all machines.
type: float
tha: *common ### ANCHOR: Configure remote machine for general purpose.
train: *common ### ANCHOR: Configure remote machine to run training.
md: *common ### ANCHOR: Configure remote machine to run MD simulation.
dft: *common ### ANCHOR: Configure remote machine to run DFT calculation.
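For reference, the schema also admits a queue-less setup that runs jobs directly on the machine itself, using the Bash batch type with a Local context (both listed among the choices above). The sketch below is a minimal illustration only: the entry name, paths, resource counts, and command are placeholders rather than values from this project, and the exact accepted spellings of the batch/context values should be checked against the dpdispatcher docs linked above. Full SSH-based Slurm and SGE examples follow in the next section.
md_local:                              # hypothetical entry name, for illustration only
  machine:
    batch_type: Bash                   # plain shell execution, no scheduler
    context_type: LocalContext         # run on the current machine, no SSH login
    remote_root: /tmp/_job_local/md    # placeholder working directory
    remote_profile: {}                 # no login information needed for a local context
  resources:
    group_size: 1
    number_node: 1
    cpu_per_node: 4
    gpu_per_node: 0
    envs:
      OMP_NUM_THREADS: 4
    # command: lmp_serial              # placeholder command (commented out, as in the examples below)
    job_limit: 1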
Example configuration:
##### SECTION: ML training
##### ANCHOR: SLURM - Local
train_1:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_local/train
remote_profile:
hostname: xxx.xxx.xxx.1
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 2
gpu_per_node: 2
custom_flags:
- "#SBATCH --job-name=ztr"
- "#SBATCH --time=168:00:00"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
# - "#SBATCH --gres=shard:1"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12sevenn
# source_list:
# - /etc/profile.d/modules.sh
envs:
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
# command: sevenn
job_limit: 1
work_load_ratio: 0.25 # ratio of total jobs to run on this machine.
##### ANCHOR: SLURM - Rocky
train_2:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_rocky/train
remote_profile:
hostname: xxx.xxx.xxx.2
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 2
gpu_per_node: 2
custom_flags:
- "#SBATCH --job-name=ztr"
- "#SBATCH --time=168:00:00"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
# - "#SBATCH --gres=shard:1"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12sevenn
# source_list:
# - /etc/profile.d/modules.sh
envs:
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
# command: sevenn
job_limit: 1
work_load_ratio: 0.25 # ratio of total jobs to run on this machine.
##### ANCHOR: SLURM - Ubuntu
train_3:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_ubuntu/train
remote_profile:
hostname: xxx.xxx.xxx.3
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=ztr"
- "#SBATCH --time=168:00:00"
# - "#SBATCH --mem=30G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
# - "#SBATCH --gres=shard:4"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12sevenn
# source_list:
# - /etc/profile.d/modules.sh
envs:
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
# command: sevenn
job_limit: 2
# work_load_ratio: 0.4
##### !SECTION
##### SECTION: MD run
##### ANCHOR: SLURM: Local-ASE
md_1:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_local/md
remote_profile:
hostname: xxx.xxx.xxx.4
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=zmd"
- "#SBATCH --time=168:00:00"
- "#SBATCH --mem=22G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
- "#SBATCH --gres=shard:2"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12lmpSevenn
# source_list:
# - /etc/profile.d/modules.sh
envs:
NP: $SLURM_NTASKS
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
# CUDA_VISIBLE_DEVICES: "" # disable GPU
# command: "mpirun -np $NP --allow-run-as-root lmp_mpi"
job_limit: 2
work_load_ratio: 0.25
##### ANCHOR: SLURM: Rocky-ASE
### To disable the GPU: `export CUDA_VISIBLE_DEVICES=""`
md_2:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_rocky/md
remote_profile:
hostname: xxx.xxx.xxx.2
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=zmd"
- "#SBATCH --time=168:00:00"
- "#SBATCH --mem=30G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
- "#SBATCH --gres=shard:2"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12lmpSevenn
# source_list:
# - /etc/profile.d/modules.sh
envs:
NP: $SLURM_NTASKS
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
# CUDA_VISIBLE_DEVICES: "" # disable GPU
# command: "mpirun -np $NP lmp_mpi"
job_limit: 2
work_load_ratio: 0.25 # ratio of total jobs to run on this machine.
##### ANCHOR: SLURM: Ubuntu GPU
md_3:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_ubuntu/md
remote_profile:
hostname: xxx.xxx.xxx.3
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=zmd"
- "#SBATCH --time=168:00:00"
- "#SBATCH --mem=30G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
- "#SBATCH --gres=shard:2"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12lmpSevenn
# source_list:
# - /etc/profile.d/modules.sh
envs:
NP: $SLURM_NTASKS
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
# CUDA_VISIBLE_DEVICES: "" # disable GPU
# command: "mpirun -np $NP lmp_mpi"
job_limit: 4
# work_load_ratio: 0.4 # ratio of total jobs to run on this machine.
##### ANCHOR: SGE: Centos - LAMMPS
md_4:
machine:
batch_type: SGE
context_type: SSHContext
remote_root: /home1/tha/_job_tachyon/md
remote_profile:
hostname: xxx.xxx.xxx.5
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
queue_name: "ib.q,1g.q"
# cpu_per_node: 8
kwargs:
# pe_name: mpi_8
job_name: zal_md
custom_flags:
- "#$ -l h_rt=168:00:00"
# - "#$ -l h=!(com001|com003|)"
module_list:
- mpi/openmpi4.1.7-clang17-IB
- conda/py12sevenn
- lammps/llvmOMPI4-sevenn
source_list:
- /etc/profile.d/modules.sh
envs:
OMP_NUM_THREADS: 1
OMPI_MCA_btl_openib_allow_ib: 1
# command: sevenn
job_limit: 10
# work_load_ratio: 0.6
##### !SECTION
##### SECTION: DFT calculation
##### ANCHOR: SLURM: Local
dft_1:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_local/w24_WSL_dft
remote_profile:
hostname: xxx.xxx.xxx.4
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=zfp"
- "#SBATCH --time=168:00:00"
- "#SBATCH --mem=22G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
- "#SBATCH --gres=shard:2"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12gpaw_gpu
# source_list:
# - /etc/profile.d/modules.sh
envs:
NP: $SLURM_NTASKS
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
GPAW_NEW: 1
GPAW_USE_GPUS: 1
# command: "mpirun -np $NP --bind-to core:overload-allowed gpaw python"
command: "mpirun -np $NP gpaw python"
job_limit: 2
work_load_ratio: 0.25 # ratio of total jobs to run on this machine.
##### ANCHOR: SLURM: Rocky
dft_2:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_rocky/dft
remote_profile:
hostname: xxx.xxx.xxx.2
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=zfp"
- "#SBATCH --time=168:00:00"
- "#SBATCH --mem=30G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
- "#SBATCH --gres=shard:2"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12gpaw_gpu
# source_list:
# - /etc/profile.d/modules.sh
envs:
NP: $SLURM_NTASKS
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
GPAW_NEW: 1
GPAW_USE_GPUS: 1
command: "mpirun -np $NP gpaw python"
job_limit: 2
work_load_ratio: 0.25 # ratio of total jobs to run on this machine.
##### ANCHOR: SLURM: Ubuntu GPU
dft_3:
machine:
batch_type: Slurm
context_type: SSHContext
remote_root: /home/tha/_job_ubuntu/dft
remote_profile:
hostname: xxx.xxx.xxx.3
username: tiny_bird
password: little_finger
port: 2225
timeout: 20
resources:
group_size: 1
number_node: 1
cpu_per_node: 1
gpu_per_node: 1
custom_flags:
- "#SBATCH --job-name=zfp"
- "#SBATCH --time=168:00:00"
- "#SBATCH --mem=30G"
- "#SBATCH --partition=all" # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
- "#SBATCH --gres=shard:2"
- "source /etc/profile.d/modules.sh"
- "module use /home/tha/app/1modulefiles"
module_list:
- conda/py12gpaw_gpu
# source_list:
# - /etc/profile.d/modules.sh
envs:
NP: $SLURM_NTASKS
OMP_NUM_THREADS: 1
OMPI_MCA_opal_cuda_support: 1
UCX_MEMTYPE_CACHE: n
GPAW_NEW: 1
GPAW_USE_GPUS: 1
command: "mpirun -np $NP gpaw python"
job_limit: 4
# work_load_ratio: 0.4 # ratio of total jobs to run on this machine.
##### !SECTION