#!/bin/bash
# Copyright (c) 2014 John Biddiscombe
# Adapted from stuff found originally somewhere on the internet
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

# Generates one directory per parameter combination, each holding a slurm
# batch script (submit-job.bash), plus a run_jobs.bash launcher that submits
# every generated job and records matching "scancel <id>" lines in
# cancel_jobs.bash.

#######################################
# Writes the slurm script for one experiment.
# We can call it with different parameter settings to create different
# experiments.
# Globals (read): NODES, TRANSFERSIZE, THREADS_PERTASK, PARCELTYPE, TIME,
#                 QUEUE, LIB_PATH, LD_LIBRARY_PATH, PROCESSES_PERNODE,
#                 EXECUTABLE1, PROGRAM_PARAMS
# Outputs: creates <job dir>/submit-job.bash and appends one launch line
#          to run_jobs.bash in the current directory
# Returns: 0 on success, 1 if the job directory already exists or cannot
#          be created
#######################################
write_script() {
  # The job name and the directory name share the same format.
  local job_name
  job_name=$(printf 'hpx-N%04d-T%05d-t%02d-%s' \
    "${NODES}" "${TRANSFERSIZE}" "${THREADS_PERTASK}" "${PARCELTYPE}")
  local dir_name=${job_name}

  if [[ -d "${dir_name}" ]]; then
    # Directory already exists, skip generation of this job
    echo "Exists already : Skipping ${dir_name}"
    return 1
  fi

  echo "Creating job ${dir_name}"
  if ! mkdir -p -- "${dir_name}"; then
    echo "Cannot create ${dir_name}" >&2
    return 1
  fi

  # Unquoted heredoc: every ${...} below is expanded now, at generation
  # time, so the emitted script contains literal values.
  # NOTE(review): this includes LD_LIBRARY_PATH, i.e. the generating shell's
  # value is baked into the job script - confirm that this is intended.
  cat << _EOF_ > "${dir_name}/submit-job.bash"
#!/bin/bash
#SBATCH --job-name=${job_name}
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --nodes=${NODES}
#SBATCH --time=${TIME}
#SBATCH --exclusive
#SBATCH --distribution=cyclic
#SBATCH --constraint=gpu
#SBATCH --partition=${QUEUE}

## #SBATCH --cpus-per-task=1
## #SBATCH --dependency=singleton

#======START=====
module load slurm

#
# mvapich settings used at CSCS
# not all are relevant for this test
#
export LD_LIBRARY_PATH=${LIB_PATH}:${LD_LIBRARY_PATH}

# slurm launch command
srun -n $(( PROCESSES_PERNODE * NODES )) ${EXECUTABLE1} ${PROGRAM_PARAMS}

_EOF_

  # make the job script executable
  chmod 775 "${dir_name}/submit-job.bash"

  # Add a line to the launcher that submits the job and appends the matching
  # scancel command to the cancel jobs script. The escaped \$ are evaluated
  # when run_jobs.bash runs; sbatch is only attempted if the cd succeeded.
  cat << _EOF_ >> run_jobs.bash
cd "${dir_name}" && { JOB=\$(sbatch submit-job.bash); echo "\$JOB"; echo "\$JOB" | sed 's/Submitted batch job/scancel/g' >> "\$BASEDIR/cancel_jobs.bash"; }; cd "\$BASEDIR"
_EOF_
}

#######################################
# Computes the --localMB value: 128 * transferKB * threads / 1024, rounded
# to the nearest integer (ties to even) and capped at 1024.
# Arguments: $1 - transfer size in KB, $2 - threads per task
# Outputs:   the size on stdout
#######################################
compute_local_mb() {
  local -r transfer_kb=$1
  local -r threads=$2
  local product=$(( 128 * transfer_kb * threads ))
  local mb=$(( product / 1024 ))
  local rem=$(( product % 1024 ))

  # Round half to even so that exact .5 cases are not always rounded up.
  if (( rem > 512 || (rem == 512 && mb % 2 == 1) )); then
    mb=$(( mb + 1 ))
  fi
  if (( mb > 1024 )); then
    mb=1024
  fi
  printf '%d\n' "${mb}"
}

# get the path to this generate script, works for most cases
BASEDIR=$(cd -- "$(dirname -- "$0")" && pwd) || exit 1

# run_jobs.bash changes into \$BASEDIR/<job dir> before calling sbatch, so the
# job directories and the launcher have to be created inside BASEDIR.
cd -- "${BASEDIR}" || exit 1
echo "Generating jobs using base directory ${BASEDIR}"

# Create another script to submit all generated jobs to the scheduler
{
  printf '%s\n' '#!/bin/bash'
  # %q keeps the assignment valid even if the path contains spaces
  printf 'BASEDIR=%q\n' "${BASEDIR}"
  printf '%s\n' 'cd "$BASEDIR"'
} > run_jobs.bash
chmod 775 run_jobs.bash

#
# Settings; the @...@ placeholders are left as found in the template.
# MPIEXEC and JOB_OPTIONS1 are not referenced anywhere else in this script.
#
MPIEXEC="@MPIEXEC@"
EXECUTABLE1="@EXE_PATH@"
LIB_PATH="@LIB_PATH@"
JOB_OPTIONS1="@JOB_OPTIONS1@"
TIME="00:10:00"
PROCESSES_PERNODE=1

# Loop through all the parameter combinations generating jobs for each
#for NODES in 2 4 8 16 32 64 128 256 512 1024 2048 4096
for NODES in 32 64 128; do
  # Pick the partition from the node count.
  if (( NODES == 4096 )); then
    QUEUE=large
  elif (( NODES < 4 )); then
    QUEUE=debug
  else
    QUEUE=normal
  fi

  # for PARCELTYPE in "mpi" "libfabric"
  for PARCELTYPE in "libfabric"; do
    # Start with every parcelport disabled, then enable the selected one.
    TCP_ENABLE="-Ihpx.parcel.tcp.enable=0"
    MPI_ENABLE="-Ihpx.parcel.mpi.enable=0"
    FAB_ENABLE="-Ihpx.parcel.libfabric.enable=0"
    BOOTSTRAP="-Ihpx.parcel.bootstrap=${PARCELTYPE}"
    case "${PARCELTYPE}" in
      tcp)
        TCP_ENABLE="-Ihpx.parcel.tcp.enable=1"
        ;;
      mpi)
        MPI_ENABLE="-Ihpx.parcel.mpi.enable=1"
        ;;
      libfabric)
        FAB_ENABLE="-Ihpx.parcel.libfabric.enable=1"
        ;;
    esac

    HPX_ARGS="-Ihpx.parcel.message_handlers=0 --hpx:bind=balanced "
    # -Ihpx.max_busy_loop_count=10 --hpx:attach-debugger=exception"

    for TRANSFERSIZE in 256 512 1024 2048 4096 8192; do
      for THREADS_PERTASK in 12; do
        LOCAL_SIZE=$(compute_local_mb "${TRANSFERSIZE}" "${THREADS_PERTASK}")
        PROGRAM_PARAMS="${BOOTSTRAP} ${TCP_ENABLE} ${MPI_ENABLE} ${FAB_ENABLE} --hpx:threads=${THREADS_PERTASK} ${HPX_ARGS} --localMB=${LOCAL_SIZE} --transferKB=${TRANSFERSIZE} --parceltype=${PARCELTYPE} --distribution=0 --all-to-all=1 --no-local=1 --iterations=100"
        write_script
      done
    done
  done
done

# Final launcher line: a hint on how to collect the results.
printf '%s\n' 'echo "Use find . -name \*.out -exec grep CSVData {} \;" ' >> run_jobs.bash