Schema for configuring the remote machines¶

The top_key in machine_dict should start with special prefix to be able sort out for each type of calculations.

Examples: - train_1, train_2,... for training jobs - lammps_1, lammps_2,... for lammps jobs - gpaw_1, gpaw_2,... for gpaw jobs

Schema:¶

### The `machine` and `resources` dicts accept all keywords as in the dpdispatcher package
  # machine dict: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/machine.html
  # resources dict: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/resources.html

common_schema: &common    ### Configure remote machine
  type: dict
  schema:
    machine:              ### ANCHOR: Configure parameters for login to remote machine.
      type: dict
      required: True
      allow_unknown: False
      schema:
        batch_type:           # batch system type. Supports:'Bash', 'Slurm', 'SGE', 'OpenPBS', 'TORQUE', 'LSF', 'JH_UniScheduler', 'Bohrium', 'DistributedShell', 'Fugaku', 'OpenAPI'. See more: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/batch.html
          type: string
          allowed: ['Bash', 'Slurm', 'SGE', 'OpenPBS', 'TORQUE', 'LSF', 'JH_UniScheduler', 'Bohrium', 'DistributedShell', 'Fugaku', 'OpenAPI']
        context_type:         # context type. Supports: 'Local', 'SSH', 'Bohrium', 'HDFS', 'OpenAPI'.See more: https://docs.deepmodeling.com/projects/dpdispatcher/en/latest/context.html
          type: string
          allowed: ['Local', 'SSH', 'Bohrium', 'HDFS', 'OpenAPI']
        remote_root:          # remote root directory
          type: string
        retry_count:          # number of retries if job fails. Default is 3.
          type: integer
        remote_profile:       # profile
          type: dict
          schema:
            hostname:         # hostname
              type: string
            username:         # username
              type: string
            password:         # password
              type: string
            port:             # port
              type: integer
            timeout:          # timeout
              type: integer
            execute_command:  # command to execute right after login
              type: string

    resources:            ### ANCHOR: Configure resources on remote machine.
      type: dict
      required: True
      allow_unknown: True
      schema:
        number_node:          # number of nodes.
          type: integer
        cpu_per_node:         # number of CPUs per node.
          type: integer
        gpu_per_node:         # number of GPUs per node.
          type: integer
        custom_flags:         # list[str] of custom flags
          type: list
        module_list:          # list[str] of modules
          type: list
        source_list:          # source list[str]
          type: list
        envs:                 # environment variables
          type: dict

    command:              # command to execute at remote machine.
      type: string
    submit_size:            # maximum jobs in one submission to cluster. Default is 5.
      type: integer
    work_load_ratio:      # ratio of total jobs to run on this machine. If not set, all jobs will be distributed equally between all machines.
      type: float


tha: *common            ### ANCHOR: Configure remote machine for general purpose.

train: *common          ### ANCHOR: Configure remote machine to run training.

md: *common             ### ANCHOR: Configure remote machine to run MD simulation.

dft: *common            ### ANCHOR: Configure remote machine to run DFT calculation.

Example config 1:¶

### Example configuration file for multiple remote machines with different resources.


#####SECTION ML training
#####ANCHOR SLURM - Local
train_1:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_local/train
    retry_count: 2
    remote_profile:
      hostname: xxx.xxx.xxx.1
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:
    number_node: 1
    cpu_per_node: 2
    gpu_per_node: 2
    custom_flags:
      - "#SBATCH --job-name=ztr"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      # - "#SBATCH --gres=shard:1"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12sevenn
      # source_list:
      #   - /etc/profile.d/modules.sh
    envs:
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
  # command: sevenn
  submit_size: 1
  work_load_ratio: 0.25  # ratio of total jobs to run on this machine.


#####ANCHOR SLURM - Rocky
train_2:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_rocky/train
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.2
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 2
    gpu_per_node: 2
    custom_flags:
      - "#SBATCH --job-name=ztr"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      # - "#SBATCH --gres=shard:1"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12sevenn
      # source_list:
      #   - /etc/profile.d/modules.sh
    envs:
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
  # command: sevenn
  submit_size: 1
  work_load_ratio: 0.25 # ratio of total jobs to run on this machine.


#####ANCHOR SLURM - Ubuntu
train_3:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_ubuntu/train
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.3
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=ztr"
      - "#SBATCH --time=168:00:00"
      # - "#SBATCH --mem=30G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      # - "#SBATCH --gres=shard:4"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12sevenn
    # source_list:
      #   - /etc/profile.d/modules.sh
    envs:
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
  # command: sevenn
  submit_size: 2
  # work_load_ratio: 0.4
##### !SECTION


#####SECTION MD run
#####ANCHOR SLURM: Local-ASE
lammps_1:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_local/md
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.4
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=zmd"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --mem=22G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      - "#SBATCH --gres=shard:2"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12lmpSevenn
    # source_list:
    #   - /etc/profile.d/modules.sh
    envs:
      NP: $SLURM_NTASKS
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
      # CUDA_VISIBLE_DEVICES: ""  # disable GPU
  # command: "mpirun -np $NP --allow-run-as-root lmp_mpi"
  submit_size: 2
  work_load_ratio: 0.25


#####ANCHOR SLURM: Rocky-ASE
### to disable GPU: `export=CUDA_VISIBLE_DEVICES=""`
lammps_2:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_rocky/md
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.2
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=zmd"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --mem=30G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      - "#SBATCH --gres=shard:2"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12lmpSevenn
    # source_list:
    #   - /etc/profile.d/modules.sh
    envs:
      NP: $SLURM_NTASKS
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
      # CUDA_VISIBLE_DEVICES: ""  # disable GPU
  # command: "mpirun -np $NP lmp_mpi"
  submit_size: 2
  work_load_ratio: 0.25  # ratio of total jobs to run on this machine.


#####ANCHOR SLURM: Ubuntu GPU
lammps_3:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_ubuntu/md
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.3
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=zmd"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --mem=30G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      - "#SBATCH --gres=shard:2"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12lmpSevenn
    # source_list:
    #   - /etc/profile.d/modules.sh
    envs:
      NP: $SLURM_NTASKS
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
      # CUDA_VISIBLE_DEVICES: ""  # disable GPU
  # command: "mpirun -np $NP lmp_mpi"
  submit_size: 4
  # work_load_ratio: 0.4  # ratio of total jobs to run on this machine.


#####ANCHOR SGE: Centos - LAMMPS
lammps_4:
  machine:
    batch_type: SGE
    context_type: SSH
    remote_root: /home1/tha/_job_tachyon/md
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.5
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    queue_name: "ib.q,1g.q"
    # cpu_per_node: 8
    kwargs:
      # pe_name: mpi_8
      job_name: zal_md
    custom_flags:
      - "#$ -l h_rt=168:00:00"
      # - "#$ -l h=!(com001|com003|)"
    module_list:
      - mpi/openmpi4.1.7-clang17-IB
      - conda/py12sevenn
      - lammps/llvmOMPI4-sevenn
    source_list:
      - /etc/profile.d/modules.sh
    envs:
      OMP_NUM_THREADS: 1
      OMPI_MCA_btl_openib_allow_ib: 1

  # command: sevenn
  submit_size: 10
  # work_load_ratio: 0.6
##### !SECTION


#####SECTION DFT calculation
#####ANCHOR SLURM: Local
gpaw_1:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_local/w24_WSL_dft
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.4
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=zfp"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --mem=22G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      - "#SBATCH --gres=shard:2"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12gpaw_gpu
    # source_list:
    #   - /etc/profile.d/modules.sh
    envs:
      NP: $SLURM_NTASKS
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
      GPAW_NEW: 1
      GPAW_USE_GPUS: 1
  # command: "mpirun -np $NP --bind-to core:overload-allowed gpaw python"
  command: "mpirun -np $NP gpaw python"
  submit_size: 2
  work_load_ratio: 0.25  # ratio of total jobs to run on this machine.


#####ANCHOR SLURM: Rocky
gpaw_2:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_rocky/dft
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.2
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=zfp"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --mem=30G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      - "#SBATCH --gres=shard:2"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12gpaw_gpu
    # source_list:
    #   - /etc/profile.d/modules.sh
    envs:
      NP: $SLURM_NTASKS
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
      GPAW_NEW: 1
      GPAW_USE_GPUS: 1
  command: "mpirun -np $NP gpaw python"
  submit_size: 2
  work_load_ratio: 0.25  # ratio of total jobs to run on this machine.


#####ANCHOR SLURM: Ubuntu GPU
gpaw_3:
  machine:
    batch_type: Slurm
    context_type: SSH
    remote_root: /home/tha/_job_ubuntu/dft
    retry_count: 0
    remote_profile:
      hostname: xxx.xxx.xxx.3
      username: tiny_bird
      password: little_finger
      port: 2225
      timeout: 20

  resources:

    number_node: 1
    cpu_per_node: 1
    gpu_per_node: 1
    custom_flags:
      - "#SBATCH --job-name=zfp"
      - "#SBATCH --time=168:00:00"
      - "#SBATCH --mem=30G"
      - "#SBATCH --partition=all"       # Partition name, run `sinfo` to get a list of partitions. Use to instead of queue_name
      - "#SBATCH --gres=shard:2"

      - "source /etc/profile.d/modules.sh"
      - "module use /home/tha/app/1modulefiles"
    module_list:
      - conda/py12gpaw_gpu
    # source_list:
    #   - /etc/profile.d/modules.sh
    envs:
      NP: $SLURM_NTASKS
      OMP_NUM_THREADS: 1
      OMPI_MCA_opal_cuda_support: 1
      UCX_MEMTYPE_CACHE: n
      GPAW_NEW: 1
      GPAW_USE_GPUS: 1
  command: "mpirun -np $NP gpaw python"
  submit_size: 4
  # work_load_ratio: 0.4  # ratio of total jobs to run on this machine.
##### !SECTION