Schema for Active Learning Configuration

The schema for the input configuration file of the active learning process.

Schema:

### Schema for active learning configuration file

train:                      ### ANCHOR: Training ML model
  type: dict
  required: True
  schema:
    num_models:               # Number of models to train. Default is 1
      type: integer
    init_data_paths:          # List of paths to initial data.
      type: list
      required: True

    trainset_ratio:           # Ratio of training set. Default is 0.9
      type: float
    validset_ratio:           # Ratio of validation set. Default is 0.1
      type: float
    num_cores_buildgraph:     # number of cores for building graph data
      type: integer

    continue_train:           # Continue from checkpoints of the previous iteration. Default is True
      type: boolean

    num_grad_updates:         # Maximum number of gradient updates, used to guess num_epochs. Default is None
      type: integer
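                              # (Roughly, num_epochs is num_grad_updates divided by the number of
                              #  gradient steps per epoch, which depends on dataset size and batch_size.)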

    distributed:
      type: dict
      schema:
        distributed_backend:  # choices: 'mpi', 'nccl', or 'gloo'
          type: string
        cluster_type:         # choices: 'slurm' or 'sge'
          type: string
        gpu_per_node:         # only needed for the 'sge' cluster type. Default is 1
          type: integer

    mlp_engine:               # ML engine. Default is 'sevenn'. Choices: 'sevenn'
      type: string
    sevenn_args:              ### See: https://github.com/MDIL-SNU/SevenNet/blob/main/example_inputs/training/input_full.yml
      type: dict
      schema:
        model:
          type: dict
        train:
          type: dict
        data:
          type: dict


md:                         ### ANCHOR: Run MD exploration
  type: dict
  required: True
  schema:
    committee_std:            # Committee standard deviation bounds for each property
      type: dict
      required: True
      schema:
        e_std_lo:             # Lower bound of committee standard deviation for energy. E.g., 0.05
          type: float
        e_std_hi:             # Upper bound of committee standard deviation for energy. E.g., 0.15
          type: float
        f_std_lo:             # Lower bound of committee standard deviation for force. E.g., 0.05
          type: float
        f_std_hi:             # Upper bound of committee standard deviation for force. E.g., 0.15
          type: float
        s_std_lo:             # Lower bound of committee standard deviation for stress. E.g., 0.05
          type: float
        s_std_hi:             # Upper bound of committee standard deviation for stress. E.g., 0.2
          type: float
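                              # (Typically in committee-based active learning, frames whose
                              #  deviation falls between the *_std_lo and *_std_hi bounds are
                              #  selected as candidates for DFT labeling.)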

    md_engine:                # MD engine. Default is 'ase'. Choices: 'ase', 'lammps'
      type: string
      allowed: ['ase', 'lammps']

    model_idx:                # Indices of committee models used for the MD run. For LAMMPS, only the first model in the list is used to compute forces. For ASE, all listed models are used to compute the average force.
      type: list  # list[int]

    init_struct_paths:        # List of paths to initial structures. Accepts wildcards, e.g., "path/to/dir/*.extxyz"
      type: list
      required: True


    common_md_args:           # Common parameters for MD. Accepts all keywords of the [ASE MD schema](https://thangckt.github.io/alff_doc/schema/config_ase/)
                              # These common parameters are overwritten by the parameters in `sampling_spaces`.
      type: dict
      allow_unknown: True
      schema:
        dt:                   # timestep in fs. Default is 1.0 fs
          type: float
        thermostat:           # Common thermostat. Default is 'nose_hoover_chain'
          type: string
        barostat:             # Common barostat. Default is 'iso_nose_hoover_chain'
          type: string

    sampling_spaces:          ### List of dicts of sampling spaces for the MD run. For each dict:
                                # If both 'pressures' and 'temps' are set, the NPT ensemble is used.
                                # If only 'temps' is set, the NVT ensemble is used.
                                # If neither is set, the NVE ensemble is used.
      type: list
      required: True
      schema:
        type: dict
        allow_unknown: True
        schema:
          init_struct_idxs:   # List of indices into `init_struct_paths`, e.g., [0, 1, 2, 3]. Also accepts range strings, e.g., [1, 2, "6-10", "3-5:2"], where the last item of a range is inclusive (see the parsing sketch after this schema).
            type: list
          temps:              # list of temperatures in K, e.g., [1, 300, 1500]
            type: list
          pressures:          # List of external stresses in GPa. Default is None if not set. Each entry accepts a float, a 6-vector, or a 3x3 matrix.
            type: [float, list]
          # Other keys are passed as keywords of the [ASE MD schema](https://thangckt.github.io/alff_doc/schema/config_ase/), except 'temperature' and 'stress'.


dft:                        ### ANCHOR: DFT calculators
  type: dict
  required: True
  schema:
    calc_args:                 # Accepts all keywords of ['ase.calc'](https://thangckt.github.io/alff_doc/schema/config_ase/)
      type: dict
      required: True
      schema:
        gpaw:                 # GPAW calculator parameters
          type: dict
          required: True
        dftd3:                # DFT-D3 calculator for van der Waals correction
          type: dict
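
As noted at `init_struct_idxs`, index lists may mix plain integers with inclusive range strings such as "6-10" and strided ranges such as "3-5:2". The helper below is an illustrative sketch of that documented behavior; it is not ALFF's actual implementation:

```python
# Illustrative sketch: expand entries such as [1, 2, "6-10", "3-5:2"] into
# plain indices. The upper bound of a range is inclusive, per the schema above.
def expand_idxs(items):
    out = []
    for item in items:
        if isinstance(item, int):
            out.append(item)
        else:                                   # e.g. "6-10" or "3-5:2"
            span, _, step = item.partition(":")
            lo, hi = (int(x) for x in span.split("-"))
            out.extend(range(lo, hi + 1, int(step or 1)))
    return out

print(expand_idxs([1, 2, "6-10", "3-5:2"]))     # -> [1, 2, 6, 7, 8, 9, 10, 3, 5]
```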

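The schema above uses Cerberus-style validation rules (`type`, `required`, `schema`, `allowed`, `allow_unknown`). Assuming the config is checked that way, the snippet below is a minimal sketch of validating a config file before a run; the `schema.yaml` and `config.yaml` file names are placeholders, not part of ALFF:

```python
# Minimal sketch: validate an active-learning config against the schema above.
# File names are placeholders; ALFF's actual loader is not shown in this document.
import yaml
from cerberus import Validator

with open("schema.yaml") as f:       # the schema shown above, stored as YAML
    schema = yaml.safe_load(f)
with open("config.yaml") as f:       # a user config, e.g. the example below
    config = yaml.safe_load(f)

v = Validator(schema)
if not v.validate(config):
    raise ValueError(f"Invalid active-learning config: {v.errors}")
```
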
Example config 1:

### Example configuration file for active learning with ALFF

##### ANCHOR: Training
train:
  mlp_engine: sevenn
  num_models: 4
  init_data_paths:
    - ../1_gendata/*/*/02_gendata/data_label.extxyz
    - ../1_gendata/1_iteration_data

  trainset_ratio: 0.9
  validset_ratio: 0.1
  num_cores_buildgraph: 10       # number of cores for building graph data

  continue_train: True            # continue checkpoints from previous iteration. Default is True
  num_grad_updates: 100000       # Maximum number of updates to guess num_epochs. Default is None

  distributed:
    distributed_backend: 'nccl'     # choices: 'mpi', 'nccl', or 'gloo'
    cluster_type: 'slurm'           # choices: 'slurm' or 'sge'
    # gpu_per_node: 1               # only needed for 'sge'

  sevenn_args:  # Updated: Dec 17, 2024. See: https://github.com/MDIL-SNU/SevenNet/blob/main/example_inputs/training/input_full.yaml
    model:
      chemical_species: ['Mo', 'W', 'S', 'Se', 'Te']  # Elements model should know. [ 'Univ' | 'Auto' | manual_user_input ]
      cutoff: 5.0                                     # Cutoff radius in Angstroms. If two atoms are within the cutoff, they are connected.
      channel: 32                                   # The multiplicity (channel) of node features.
      lmax: 2                                       # Maximum order of irreducible representations (rotation order).
      num_convolution_layer: 4                      # The number of message passing layers.

      # irreps_manual:                               # Manually set irreps of the model in each layer (e.g., 128 channels + 5 layers)
        #- "128x0e"
        #- "128x0e+64x1e+32x2e"
        #- "128x0e+64x1e+32x2e"
        #- "128x0e+64x1e+32x2e"
        #- "128x0e+64x1e+32x2e"
        #- "128x0e"

      weight_nn_hidden_neurons: [64, 64]            # Hidden neurons in convolution weight neural network
      radial_basis:                                 # Function and its parameters to encode radial distance
        radial_basis_name: 'bessel'               # Only 'bessel' is currently supported
        bessel_basis_num: 8
      cutoff_function:                              # Envelope function, multiplied with radial_basis functions to initialize edge features
        cutoff_function_name: 'poly_cut'          # {'poly_cut' and 'poly_cut_p_value'} or {'XPLOR' and 'cutoff_on'}
        poly_cut_p_value: 6

      act_gate: {'e': 'silu', 'o': 'tanh'}          # Equivalent to 'nonlinearity_gates' in nequip
      act_scalar: {'e': 'silu', 'o': 'tanh'}        # Equivalent to 'nonlinearity_scalars' in nequip

      is_parity: False                              # Parity: True (E(3) group) or False (SE(3) group)

      self_connection_type: linear                # Default is 'nequip'. 'linear' is used for SevenNet-0.
      interaction_type: nequip

      conv_denominator: "avg_num_neigh"             # Valid options are "avg_num_neigh*", "sqrt_avg_num_neigh", or float
      train_denominator: False                      # Enable training for denominator in convolution layer
      train_shift_scale: False                      # Enable training for shift & scale in output layer

    train:
      random_seed: 1
      train_shuffle: True
      is_train_stress: True                         # Includes stress in the loss function
      epoch: 3                                    # Ends training after this number of epochs
      per_epoch:  20                                # Generate a checkpoint every this many epochs

      # loss: 'Huber'                                # Default is 'mse' (mean squared error)
      # loss_param:
      #     delta: 0.01

      # Each optimizer and scheduler have different available parameters.
      # You can refer to sevenn/train/optim.py for supporting optimizer & schedulers
      optimizer: 'adam'                             # Options available are 'sgd', 'adagrad', 'adam', 'adamw', 'radam'
      optim_param:
        lr: 5.0e-4

      scheduler: linearlr
      scheduler_param:
        start_factor: 1.0
        total_iters: 3               # should match `epoch` above
        end_factor: 1.0e-7

      # scheduler: 'reducelronplateau'            # One of 'steplr', 'multisteplr', 'exponentiallr', 'cosineannealinglr', 'reducelronplateau', 'linearlr'
      # scheduler_param:
      #     factor: 0.75
      #     patience: 2
      #     threshold: 5.0e-5   # only changes larger than this value are considered a change
      #     min_lr: 1.0e-12      # minimum learning rate

      # scheduler: exponentiallr
      # scheduler_param:
      #     gamma: 0.95        # large gamma means slower decay

      force_loss_weight: 1.0                                  # Coefficient for force loss
      stress_loss_weight: 1.0e-4                              # Coefficient for stress loss (stress in kbar units; 1 kbar = 0.1 GPa)

      # ['target y', 'metric']
      # Target y: TotalEnergy, Energy, Force, Stress, Stress_GPa, TotalLoss
      # Metric  : RMSE, MAE, or Loss
      error_record:
        - ['Energy', 'RMSE']
        - ['Force', 'RMSE']
        - ['Stress', 'RMSE']
        # - ['Stress_GPa', 'RMSE']
        - ['Energy', 'Loss']
        - ['Force', 'Loss']
        - ['Stress', 'Loss']
        - ['TotalLoss', 'None']
      best_metric: TotalLoss


      ### NOTE: do not use `continue` here; set `init_checkpoints` above instead
      # Continue training model from given checkpoint, or pre-trained model checkpoint for fine-tuning
      #continue:
        #checkpoint: 'checkpoint_best.pth'       # Checkpoint of pre-trained model or a model want to continue training.
        #reset_optimizer: False                  # Set True for fine-tuning
        #reset_scheduler: False                  # Set True for fine-tuning

    data:
      batch_size: 280                              # Per-GPU batch size.

      shift: 'per_atom_energy_mean'                # One of 'per_atom_energy_mean*', 'elemwise_reference_energies', float
      scale: 'force_rms'                           # One of 'force_rms*', 'per_atom_energy_std', 'elemwise_force_rms', float
      data_format: 'ase'                   # Default is 'ase'. Choices are 'ase', 'structure_list', '.sevenn_data'
      # data_format_args:                  # Parameters passed to ase.io.read
      #   energy_key: 'ref_energy'                 # Key for energy in extxyz file
      #   force_key: 'ref_forces'                  # Key for force in extxyz file
      #   stress_key: 'ref_stress'                 # Key for stress in extxyz file


##### ANCHOR: Run MD
md:
  committee_std:
    e_std_lo: 0.05
    e_std_hi: 0.15
    f_std_lo: 0.05
    f_std_hi: 0.15
    s_std_lo: 0.05
    s_std_hi: 0.15

  md_engine: 'lammps'         # 'lammps', 'ase'
  common_md_args:
    dt: 0.001                          # timestep: interpreted in fs for ASE, in ps for LAMMPS (0.001 ps = 1 fs)
    thermostat: 'nose_hoover_chain'             # 'langevin', 'nose_hoover_chain'. Nose-Hoover chain only for LAMMPS.
    barostat: 'nose_hoover_chain'      # ASE:'parrinello_rahman'. LAMMPS: 'nose_hoover_chain'
    tdamp: 100
    pdamp: 1000
  model_idx: [2]                         # indices of models used for MD

  init_struct_paths:
    ### MoX2 & WX2 bulk
    - 0init_struct/bulk_MoX2_2x2x1/MoS2_mx2_2H_02x02x01       # '0-5'  bulk MoX2
    - 0init_struct/bulk_MoX2_2x2x1/MoS2_mx2_1T_02x02x01
    - 0init_struct/bulk_MoX2_2x2x1/MoSe2_mx2_2H_02x02x01
    - 0init_struct/bulk_MoX2_2x2x1/MoSe2_mx2_1T_02x02x01
    - 0init_struct/bulk_MoX2_2x2x1/MoTe2_mx2_2H_02x02x01
    - 0init_struct/bulk_MoX2_2x2x1/MoTe2_mx2_1T_02x02x01      # 5

    - 0init_struct/bulk_WX2_2x2x1/WS2_mx2_2H_02x02x01         # '6-11'  bulk WX2
    - 0init_struct/bulk_WX2_2x2x1/WS2_mx2_1T_02x02x01
    - 0init_struct/bulk_WX2_2x2x1/WSe2_mx2_2H_02x02x01
    - 0init_struct/bulk_WX2_2x2x1/WSe2_mx2_1T_02x02x01
    - 0init_struct/bulk_WX2_2x2x1/WTe2_mx2_2H_02x02x01
    - 0init_struct/bulk_WX2_2x2x1/WTe2_mx2_1T_02x02x01        # 11

    ### MoX2 & WX2 layer
    - 0init_struct/layer_MoX2_2x2x1/MoS2_mx2_2H_02x02x01      # '24-29'  layer MoX2
    - 0init_struct/layer_MoX2_2x2x1/MoS2_mx2_1T_02x02x01
    - 0init_struct/layer_MoX2_2x2x1/MoSe2_mx2_2H_02x02x01
    - 0init_struct/layer_MoX2_2x2x1/MoSe2_mx2_1T_02x02x01
    - 0init_struct/layer_MoX2_2x2x1/MoTe2_mx2_2H_02x02x01
    - 0init_struct/layer_MoX2_2x2x1/MoTe2_mx2_1T_02x02x01     # 29

    - 0init_struct/layer_WX2_2x2x1/WS2_mx2_2H_02x02x01        # '30-35'  layer WX2
    - 0init_struct/layer_WX2_2x2x1/WS2_mx2_1T_02x02x01
    - 0init_struct/layer_WX2_2x2x1/WSe2_mx2_2H_02x02x01
    - 0init_struct/layer_WX2_2x2x1/WSe2_mx2_1T_02x02x01
    - 0init_struct/layer_WX2_2x2x1/WTe2_mx2_2H_02x02x01
    - 0init_struct/layer_WX2_2x2x1/WTe2_mx2_1T_02x02x01       # 35

  sampling_spaces:
    ### If both 'pressures' and 'temps' are set, the NPT ensemble is used.
    ### If only 'temps' is set, the NVT ensemble is used.
    ### If neither is set, the NVE ensemble is used.
    - {}
    - {}
    ### MoX2 & WX2
    - init_struct_idxs: ['0-5','6-11','12-17','18-23', '24-29','30-35']  ##TODO BULK & LAYER: NVT
      equil_steps: 20000
      traj_freq: 5
      num_frames: 20
      temps: [1, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]         # temperatures in K

    - init_struct_idxs: ['0-5','6-11','12-17','18-23', '24-29','30-35']  ##TODO BULK & LAYER: NPT xy
      equil_steps: 20000
      traj_freq: 5
      num_frames: 20
      temps: [1, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]          # temperatures in K
      pressures: [0, -0.5]                                          # stresses in GPa
      mask: [1,1,0]                                           # disable z-direction barostat

    - init_struct_idxs: ['0-5','6-11','12-17','18-23']                   ##TODO BULK: NPT xyz
      equil_steps: 20000
      traj_freq: 5
      num_frames: 20
      temps: [1, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]
      pressures: [0, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5]          # stresses in GPa
      mask: [1,1,1]
      deform_limit: 0.9


##### ANCHOR: DFT calculation
dft:
  calc_args:
    gpaw:                       ### accept GPAW parameters
      mode:
        name: 'pw'              # plane-wave mode
        ecut: 500               # energy cutoff in eV
      xc: "PBE"                 # exchange-correlation functional
      kpts:
        density: 6
        gamma: False            # if `kpts` is not set, only the Gamma point is used
      parallel:
        sl_auto: True         # enable ScaLAPACK parallelization
        use_elpa: True        # enable ELPA eigensolver
        # augment_grids: True   # use all cores for XC/Poisson solver
        # gpu: True               # enable GPU acceleration

    dftd3:                      ### DFT-D3 method for van der Waals correction
      damping: "d3zero"         # damping variant. Default is "d3zero" (zero damping). Choices: "d3bj", "d3zero", "d3bjm", "d3zerom", "d3op".
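
For reference, the two `calc_args` blocks map onto standard ASE calculators. The snippet below is a hedged sketch of one standard way to combine a GPAW calculator with a DFT-D3 correction in ASE (via `SumCalculator`); ALFF's internal wiring is not shown in this document and may differ:

```python
# Hedged sketch: combine the gpaw and dftd3 blocks above into one ASE calculator.
# Assumes GPAW and the simple-dftd3 Python bindings are installed.
from ase.calculators.mixing import SumCalculator
from gpaw import GPAW, PW
from dftd3.ase import DFTD3

gpaw_calc = GPAW(
    mode=PW(500),                        # plane-wave basis, 500 eV cutoff
    xc="PBE",
    kpts={"density": 6, "gamma": False},
    parallel={"sl_auto": True, "use_elpa": True},
)
d3_calc = DFTD3(method="PBE", damping="d3zero")  # zero-damping D3 correction
calc = SumCalculator([gpaw_calc, d3_calc])       # energies/forces are summed
```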