Commit e7a837c0 authored by brdunn's avatar brdunn

Updating to new tools

parent 8a3f2ba1
......@@ -4,4 +4,4 @@
.Ruserdata
.project
.pydevproject
**./vscode
# slurm_sim_tools
tools for slurm simulator
Auxiliary tools for automating the population of the SLURM database, generating trace files, and optimizing the configuration.
FROM centos:7
LABEL desc="Slurm simulator made ready"
# Helper scripts used to start/stop the daemons (mysqld, sshd, ...) in the container
COPY cmd_start /sbin/
COPY cmd_stop /sbin/
# Make the helper scripts executable, create the staging directory and the slurm
# account (member of wheel, password "slurm"), and install git.
# The scripts get mode 755 rather than a+rwx: /sbin/cmd_start is the image
# ENTRYPOINT, so it must not be writable by unprivileged users.
RUN \
    chmod 755 /sbin/cmd_start && \
    chmod 755 /sbin/cmd_stop && \
    mkdir /install_files && \
    useradd -d /home/slurm -ms /bin/bash slurm && \
    usermod -aG wheel slurm && \
    echo "slurm:slurm"|chpasswd && \
    echo "Added slurm user" && \
    yum -y install git && \
    yum clean all
# R script that installs all the R packages
COPY ./package_install.R /install_files
# Build the workspace tree as the slurm user so that slurm owns everything in it:
#   /home/slurm/slurm_sim_ws                  workspace root
#   /home/slurm/slurm_sim_ws/sim              simulation directory
#   /home/slurm/slurm_sim_ws/slurm_sim_tools  toolkit checkout
USER slurm
RUN \
    mkdir /home/slurm/slurm_sim_ws && \
    mkdir /home/slurm/slurm_sim_ws/sim && \
    git clone https://github.com/ubccr-slurm-simulator/slurm_sim_tools.git \
        /home/slurm/slurm_sim_ws/slurm_sim_tools
# Later installation steps need root again
USER root
# Installs MariaDB, Python 3, R (plus the R packages listed in package_install.R),
# RStudio Server and sshd, then initialises the database and creates the
# 'slurm'@'localhost' MySQL user - everything in one RUN instruction.
# Notes:
#   * gcc-c++ is installed before the pip3 steps (pymysql, pandas).
#   * the texlive glob is quoted so that yum, not the shell, expands it.
#   * the downloaded RStudio RPM is deleted in the same layer once installed,
#     so it does not stay in the image.
#   * mysqld is started only long enough to create the user, then stopped again.
RUN \
    yum -y install mariadb-server && \
    yum -y install mariadb-devel && \
    echo "Done installing Mariadb" && \
    yum -y install gcc-c++ && \
    yum -y install epel-release && \
    yum -y install python34 python34-libs python34-devel python34-numpy python34-scipy python34-pip && \
    pip3 install pymysql && \
    pip3 install pandas && \
    echo "Python all installed" && \
    yum -y install R R-Rcpp R-Rcpp-devel && \
    yum -y install python-devel && \
    yum -y install 'texlive-*' && \
    echo "R all installed" && \
    Rscript /install_files/package_install.R && \
    echo "Installed R packages" && \
    yum -y install sudo && \
    yum -y install wget && \
    echo "Sudo, git, wget installed" && \
    wget https://download2.rstudio.org/rstudio-server-rhel-1.1.453-x86_64.rpm && \
    yum -y install rstudio-server-rhel-1.1.453-x86_64.rpm && \
    rm -f rstudio-server-rhel-1.1.453-x86_64.rpm && \
    yum -y install initscripts && \
    echo "Rstudio server installed" && \
    yum -y install openssh openssh-server openssh-clients openssl-libs && \
    mkdir /var/run/sshd && \
    ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' && \
    ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -N '' && \
    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N '' && \
    echo "Ssh installed" && \
    chmod g+rw /var/lib/mysql /var/log/mariadb /var/run/mariadb && \
    mysql_install_db && \
    chown -R mysql:mysql /var/lib/mysql && \
    cmd_start mysqld && \
    mysql -e "create user 'slurm'@'localhost' identified by 'slurm';" && \
    mysql -e "GRANT ALL PRIVILEGES ON *.* TO 'slurm'@'localhost' IDENTIFIED BY 'slurm';" && \
    cmd_stop mysqld && \
    yum clean all
# Build as the slurm user so the checkout, the build tree and the install
# prefix are all owned by slurm.
USER slurm
# Clone the Slurm simulator, configure it out-of-tree in bld_opt and install it
# into /home/slurm/slurm_sim_ws/slurm_opt.
# The parallelism is capped at the number of available CPUs: "make -j" without
# an argument places no limit on the number of simultaneous jobs.
RUN \
    cd /home/slurm/slurm_sim_ws && \
    git clone https://github.com/ubccr-slurm-simulator/slurm_simulator.git && \
    mkdir bld_opt && \
    cd bld_opt && \
    ../slurm_simulator/configure --prefix=/home/slurm/slurm_sim_ws/slurm_opt --enable-simulator \
        --enable-pam --without-munge --enable-front-end --with-mysql-config=/usr/bin/ --disable-debug \
        CFLAGS="-g -O3 -D NDEBUG=1" && \
    make -j "$(nproc)" install
# Ports used by the services in this image:
#   8787 - default port of RStudio Server
#   22   - ssh
#   3306 - mysql
EXPOSE 8787 22 3306
# back to root for easier permissions stuff
USER root
# Scripts used at container start-up and by the initial test
COPY ./startup_file.sh \
     ./initial_test.sh \
     ./micro_cluster_setup.py \
     ./micro_ws_config.sh \
     ./populate_slurmdb.sh \
     ./generate_job_trace.sh \
     ./run_sim.sh \
     ./check_results.R \
     /install_files/
# Everyone gets rwx on the install_files directory and its contents
RUN chmod -R a+rwx /install_files
# cmd_start is the entrypoint; by default it runs the startup file and then the initial test file
ENTRYPOINT ["/sbin/cmd_start"]
CMD ["/install_files/startup_file.sh","/install_files/initial_test.sh"]
FROM centos:7
LABEL desc="Slurm simulator made ready"
# Helper scripts used to start/stop the daemons (mysqld, sshd, ...) in the container
COPY cmd_start /sbin/
COPY cmd_stop /sbin/
# Make the helper scripts executable. Mode 755 instead of a+rwx: /sbin/cmd_start
# is the image ENTRYPOINT, so it must not be writable by unprivileged users.
RUN \
    chmod 755 /sbin/cmd_start && \
    chmod 755 /sbin/cmd_stop
# installing mysql (mariadb)
RUN \
    yum -y update && \
    yum -y install mariadb-server && \
    yum -y install mariadb-devel && \
    echo "Done installing Mariadb"
# Python (gcc-c++ is needed so the gcc command works down the road with pandas)
RUN \
    yum -y install gcc-c++ && \
    yum -y install epel-release && \
    yum -y install python34 python34-libs python34-devel python34-numpy python34-scipy python34-pip && \
    pip3 install pymysql && \
    pip3 install pandas && \
    echo "Python all installed"
# Installing R; the texlive glob is quoted so that yum, not the shell, expands it
RUN \
    yum -y install R R-Rcpp R-Rcpp-devel && \
    yum -y install python-devel && \
    yum -y install 'texlive-*' && \
    echo "R all installed"
# Extra tooling needed further down:
#   sudo - lets the slurm user run root commands if needed
#   git  - fetches the sources from github
#   wget - downloads the rstudio server package
RUN \
    yum -y install sudo && \
    yum -y install git && \
    yum -y install wget
# Staging directory for the install files, plus the slurm account
# (home /home/slurm, bash shell, member of wheel, password "slurm")
RUN \
    mkdir /install_files && \
    useradd -d /home/slurm -ms /bin/bash slurm && \
    usermod -aG wheel slurm && \
    echo "slurm:slurm"|chpasswd && \
    echo "Added slurm user"
# Work as slurm so the directories created next are owned by slurm
USER slurm
# Workspace root, simulation directory and toolkit checkout
RUN \
    mkdir /home/slurm/slurm_sim_ws && \
    mkdir /home/slurm/slurm_sim_ws/sim && \
    git clone https://github.com/nsimakov/slurm_sim_tools.git \
        /home/slurm/slurm_sim_ws/slurm_sim_tools
# Root again for the remaining installation steps
USER root
# copies over the R script that installs all the packages for R into /install_files
COPY ./package_install.R /install_files
# Runs the R script to install the packages
RUN \
    Rscript /install_files/package_install.R && \
    echo "Installed packages"
# Installs Rstudio Server. The downloaded RPM is removed in the same layer
# once it has been installed, so it does not stay in the image.
RUN \
    wget https://download2.rstudio.org/rstudio-server-rhel-1.1.453-x86_64.rpm && \
    yum -y install rstudio-server-rhel-1.1.453-x86_64.rpm && \
    rm -f rstudio-server-rhel-1.1.453-x86_64.rpm && \
    yum -y install initscripts
# 8787 is the default port that rstudio server uses, so need to expose it to use it
EXPOSE 8787
# installs ssh and generates the rsa, ecdsa and ed25519 host keys (empty passphrase)
RUN \
    yum -y install openssh openssh-server openssh-clients openssl-libs && \
    mkdir /var/run/sshd && \
    ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' && \
    ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -N '' && \
    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N ''
# need to expose port 22 to allow for ssh to work properly
EXPOSE 22
# configuring mysqld: initialise the data directory, start the server just long
# enough to create the 'slurm'@'localhost' user with all privileges, then stop it
RUN \
    chmod g+rw /var/lib/mysql /var/log/mariadb /var/run/mariadb && \
    mysql_install_db && \
    chown -R mysql:mysql /var/lib/mysql && \
    cmd_start mysqld && \
    mysql -e "create user 'slurm'@'localhost' identified by 'slurm';" && \
    mysql -e "GRANT ALL PRIVILEGES ON *.* TO 'slurm'@'localhost' IDENTIFIED BY 'slurm';" && \
    cmd_stop mysqld
# expose for mysql use
EXPOSE 3306
# slurm user switch again, so the checkout and build output are owned by slurm
USER slurm
# Clone the slurm simulator, configure it out-of-tree in bld_opt and install it
# into /home/slurm/slurm_sim_ws/slurm_opt.
# The parallelism is capped at the number of available CPUs: "make -j" without
# an argument places no limit on the number of simultaneous jobs.
RUN \
    cd /home/slurm/slurm_sim_ws && \
    git clone https://github.com/nsimakov/slurm_simulator.git && \
    mkdir bld_opt && \
    cd bld_opt && \
    ../slurm_simulator/configure --prefix=/home/slurm/slurm_sim_ws/slurm_opt --enable-simulator \
        --enable-pam --without-munge --enable-front-end --with-mysql-config=/usr/bin/ --disable-debug \
        CFLAGS="-g -O3 -D NDEBUG=1" && \
    make -j "$(nproc)" install
# root again for easier permissions handling
USER root
# Files used at startup
COPY ./startup_file.sh \
     ./initial_test.sh \
     ./micro_cluster_setup.py \
     ./micro_ws_config.sh \
     ./populate_slurmdb.sh \
     ./generate_job_trace.sh \
     ./run_sim.sh \
     ./check_results.R \
     /install_files/
# Everyone gets rwx on the install_files directory so things can be run
RUN chmod -R a+rwx /install_files
# need to expose this for the slurmdbd to work maybe?
#EXPOSE 29001
# cmd_start is the entrypoint; by default it runs the startup file and then the initial test file
ENTRYPOINT ["/sbin/cmd_start"]
CMD ["/install_files/startup_file.sh","/install_files/initial_test.sh"]
#!/usr/bin/env Rscript
# This script gets the results from the simulation and runs some tests on them
# It tests if the requested features were given to the jobs
# Features: cpu type, gpu, big mem
# How implemented - each feature corresponds to a different type of node
library(RSlurmSimTools) # needs these libraries to run the tests
library(dplyr)
# this function allows easier comparison to see if the simulator assigned things correctly
# checks the trace value (requested feature) against the sacct value (Node List, indicating assigning of a feature)
# the check values are there for reuse of the function for more than one type of test
# Checks one row of the joined trace/sacct data frame.
#   df.joined   - data frame joining the job trace with the sacct output
#   row_num     - row (job) to inspect
#   trace_col   - trace column holding the requested feature
#   trace_check - feature value this check applies to
#   sacct_col   - sacct column to inspect (the node list)
#   sacct_check - pattern that must occur in the sacct value when the feature was requested
# Returns TRUE when the feature was not requested (NA or a different value) or when
# the sacct value matches sacct_check; returns FALSE when the feature was requested
# but the sacct value does not match.
check_nodes <- function(df.joined, row_num, trace_col, trace_check, sacct_col, sacct_check){
  requested <- df.joined[row_num, trace_col]  # feature asked for in the trace
  assigned <- df.joined[row_num, sacct_col]   # what the simulator reported

  # NA means no feature was requested, so there is nothing to verify
  if (is.na(requested)) {
    return(TRUE)
  }
  # a different feature was requested; this check does not apply
  if (!(requested == trace_check)) {
    return(TRUE)
  }
  # the feature was requested: the node list must contain the matching node
  if (!(grepl(sacct_check, assigned))) {
    return(FALSE)
  }
  TRUE
}
# reads in the csv file of the job traces (jobs submitted)
job_trace <- read.csv(file="/home/slurm/slurm_sim_ws/slurm_sim_tools/reg_testing/micro_cluster/test_trace.csv")
# reads in log file of resulting data (what jobs were assigned, where, etc)
sacct_base <- read_sacct_out("/home/slurm/slurm_sim_ws/sim/micro/baseline/results/jobcomp.log")
# creating a joined data frame by job id so that can go through jobs easier
joined <- left_join(job_trace, sacct_base, by = c("sim_job_id" = "local_job_id") )
done_well = TRUE # assumes did correctly
# loops through each row in the joined data frame
# seq_len() gives an empty sequence for a zero-row data frame, whereas
# 1:nrow(joined) would iterate over 1 and 0 and index rows that do not exist
for(row in seq_len(nrow(joined)))
{
  # checks if all features have been met (or weren't present)
  done_well = check_nodes(joined, row, "sim_req_mem", 500000, "NodeList", "b") && # big mem
    check_nodes(joined, row, "sim_features", "CPU-M", "NodeList", "m") && # M cpu
    check_nodes(joined, row, "sim_features", "CPU-N", "NodeList", "n") && # N cpu
    check_nodes(joined, row, "sim_gres", "gpu:1", "NodeList", "g") && # 1 gpu
    check_nodes(joined, row, "sim_gres", "gpu:2", "NodeList", "g") # 2 gpu
  # if at any point a feature doesn't match, breaks out of the loop
  if(!done_well)
  {
    # prints out the job id for tracing back what failed
    jobid = joined[row, "sim_job_id"]
    print(paste("Id of incorrectly assigned job:", jobid))
    break
  }
}
# prints overall result
print("Did the simulator do well?.....")
print(done_well)
#!/bin/bash
# Start-up helper: launches the daemons / commands named on the command line.
echo "Reach Entry Point"
# Record the PID of this script.
# NOTE(review): "enrypoint" looks like a misspelling of "entrypoint" - confirm that
# nothing else reads this path before renaming it.
echo $$ > /var/run/enrypoint.pid
# Abort on the first failing command; the -set-no-exit-on-fail and
# -set-exit-on-fail arguments toggle this later on.
set -e
# Flags filled in while the arguments are processed:
#   loop     - 1 when -loop was given (keep the script alive in a sleep loop)
#   run_bash - 1 when bash was given (open a shell after everything is launched)
loop=0
run_bash=0
# Launch a daemon unless it is already running.
# Format:
#   start_process name command pid_file
# The daemon counts as running when pid_file exists and the PID stored in it
# has an entry under /proc. /proc/<pid> is a directory, so it has to be tested
# with -d; a -f test never succeeds and would relaunch the daemon every time.
start_process(){
    name=$1
    command=$2
    pid_file=$3
    if [ ! -f "${pid_file}" ] || [ ! -d "/proc/$(cat "${pid_file}")" ]; then
        echo "Launching ${name}"
        # left unquoted on purpose so a command with arguments is split into words
        ${command}
    else
        echo "${name} already running"
    fi
}
# Start process and confirm it launches by looking for a
# confirm_sentence in log_file.
# Format:
#   start_process_w_confirm name command pid_file log_file confirm_sentence
# Returns 0 once confirm_sentence shows up in log_file. Returns 1 when the
# process is already running (pid_file exists and its PID is under /proc) or
# when the sentence is not found within timeout_time seconds.
# An existing log_file is appended to "<log_file>.old" and removed before the
# launch, so only output from this launch is searched.
start_process_w_confirm(){
    name=$1
    command=$2
    pid_file=$3
    log_file=$4
    confirm_sentence=$5
    timeout_time=2
    if [ -f "${pid_file}" ]; then
        if [ -d "/proc/$(cat "${pid_file}")" ]; then
            echo "${name} already running"
            return 1
        fi
    fi
    if [ -f "${log_file}" ]; then
        cat "${log_file}" >> "${log_file}.old"
        rm "${log_file}"
    fi
    echo "Launching ${name}"
    # left unquoted on purpose so a command with arguments is split into words
    ${command}
    for ((i=0; i < timeout_time; i++)); do
        # Quoted so a multi-word sentence stays a single pattern; stderr is
        # dropped because the log file may not have been created yet.
        if grep -q -- "${confirm_sentence}" "${log_file}" 2>/dev/null; then
            echo "${name} is up"
            return 0
        fi
        sleep 1
    done
    echo "Something wrong with ${name}, can not find key-phrase in log"
    return 1
}
# Process the command-line arguments in order. Each argument is a known daemon
# name, one of the option flags (bash, -loop, -set-no-exit-on-fail,
# -set-exit-on-fail), or otherwise a command that is executed as given.
for var in "$@"
do
    case "$var" in
    mysqld)
        echo "Launching mysqld"
        # Shut mysqld down when this script receives a termination signal.
        # The images install the stop helper as /sbin/cmd_stop, so the trap
        # calls that rather than /sbin/shutdown.sh.
        trap "/sbin/cmd_stop mysqld; exit" SIGHUP SIGINT SIGTERM
        mysqld_safe &
        # wait (up to 30 attempts) until the server answers a ping
        mysqladmin --silent --wait=30 ping
        ;;
    munged)
        start_process munged munged /run/munge/munged.pid
        ;;
    sshd)
        start_process sshd /usr/sbin/sshd /run/sshd.pid
        ;;
    slurmdbd)
        start_process_w_confirm slurmdbd /usr/sbin/slurmdbd /run/slurmdbd.pid \
            /var/log/slurm/slurmdbd.log started
        ;;
    slurmctld)
        start_process_w_confirm slurmctld /usr/sbin/slurmctld /run/slurmctld.pid \
            /var/log/slurm/slurmctld.log started
        ;;
    slurmd)
        start_process_w_confirm slurmd /usr/sbin/slurmd /run/slurmd.pid \
            /var/log/slurm/slurmd.log started
        ;;
    self_contained_slurm_wlm)
        # convenience target: bring up the whole stack in order
        cmd_start munged sshd mysqld slurmdbd slurmctld slurmd
        ;;
    bash)
        echo "Launching bash"
        run_bash=1
        ;;
    -loop)
        loop=1
        ;;
    -set-no-exit-on-fail)
        set +e
        ;;
    -set-exit-on-fail)
        set -e
        ;;
    *)
        # anything else is run as a command (word-split on purpose)
        echo "Executing ${var}"
        ${var}
        ;;
    esac
done
# After everything is launched: open a shell if bash was requested, otherwise
# stay alive in a sleep loop if -loop was given.
if [ $run_bash -eq 1 ]; then
    /bin/bash
elif [ $loop -eq 1 ]; then
    echo "All requested daemon launched"
    while true; do
        sleep 60
    done
fi
#!/bin/bash
# Send SIGTERM to the given PID and wait until the process is gone.
# Format:
#   kill_process pid
# /proc/<pid> is a directory, so its presence is tested with -d; a -f test
# never succeeds and the loop would not wait at all.
kill_process(){
    kill -SIGTERM "$1"
    while [ -d "/proc/$1" ]; do sleep 1; done
}
# Stop the daemon whose PID is stored in pid_file.
# Format:
#   stop_process name pid_file
# When pid_file does not exist the daemon is reported as not running.
stop_process(){
    name=$1
    pid_file=$2
    if [ ! -f "${pid_file}" ]; then
        echo "${name} is not running"
        return
    fi
    echo "Stopping ${name}"
    kill_process $(cat ${pid_file})
}
# Stop every service named on the command line, in the order given.
for var in "$@"
do
    case "$var" in
    mysqld)
        # mysqld is shut down through its admin client rather than a pid file
        echo "Stopping mysqld"
        mysqladmin shutdown
        ;;
    munged)
        stop_process munged /run/munge/munged.pid
        ;;
    sshd|slurmdbd|slurmctld|slurmd)
        # these daemons all keep their pid file at /run/<name>.pid
        stop_process "${var}" "/run/${var}.pid"
        ;;
    self_contained_slurm_wlm)
        # convenience target: stop the whole stack
        cmd_stop slurmd slurmctld slurmdbd mysqld sshd munged
        ;;
    *)
        echo "unknown command ${var}"
        ;;
    esac
done
#!/bin/bash
# This script sets up and runs the generation of the job trace files using Rscript
# goes to the directory where we want to edit things in (working directory);
# stop here if it is missing instead of running the R script from the wrong place
cd /home/slurm/slurm_sim_ws/slurm_sim_tools/reg_testing/micro_cluster/ || exit 1
# begins the R script that generates a bunch of test jobs for the simulator
Rscript 12_prep_jobs_for_testrun.R
#!/bin/bash
# this script sets up the micro cluster simulation and runs it, checking if it works properly
echo "Setting up Micro Cluster simulation...."
# creates the mysql database needed for the simulation.
# IF NOT EXISTS keeps this step from failing when the script runs again against
# an existing database. No separate "USE" call is made: each "mysql -e" is its
# own client session, so a USE statement there has no lasting effect.
mysql -e "CREATE DATABASE IF NOT EXISTS slurm_micro2sim;"
# calls the setup file for the micro Cluster simulation (executes as slurm)
su slurm -c /install_files/micro_cluster_setup.py
echo "Done with Micro Cluster Setup"
echo "Starting simulation...."
# runs the simulation as the slurm user so the simulator doesn't get upset
su slurm -c /install_files/run_sim.sh
echo "Simulation Finished."
echo "Starting R check file....."
# this file runs some code that checks if features were given correctly
Rscript /install_files/check_results.R
cd /home/slurm # goes to the home directory of slurm
su slurm # switches to slurm user at the end (starts bash)
#!/usr/bin/env python3
#This python script sets up the micro Cluster simulation to run by calling other scripts
# using these to work with calling files to do things
import os
import subprocess
from time import sleep,time # need sleep to do the sleep(3) after dbd
def start_finish_process(file_path):
    """Run the program at ``file_path`` and block until it has exited.

    Args:
        file_path: program to launch; passed to ``subprocess.Popen`` as ``args``
            and concatenated into the log message, so a string is expected.

    Returns:
        The finished ``subprocess.Popen`` object.
    """
    child = subprocess.Popen(args=file_path)
    # do not continue until the child is done
    child.wait()
    print("Finished process of: " + file_path)
    return child
def startup_slurmdbd(dbd_loc, conf_loc):
    """Launch the slurmdbd executable and give it time to spin up.

    The daemon is started as ``<dbd_loc> -Dvvv`` with an environment that
    contains only ``SLURM_CONF=<conf_loc>``; the call then sleeps 3 seconds.

    Args:
        dbd_loc: path of the slurmdbd executable.
        conf_loc: value for the ``SLURM_CONF`` environment variable.

    Returns:
        The ``subprocess.Popen`` object of the still-running daemon.
    """
    dbd_command = [dbd_loc, "-Dvvv"]
    dbd_environment = {"SLURM_CONF": conf_loc}
    daemon = subprocess.Popen(args=dbd_command, env=dbd_environment)
    sleep(3)  # spin-up time
    print("Started up the Slurmdbd")
    return daemon