From 6b0dc88d7cc7a26e2e616e6704653910fa0bd998 Mon Sep 17 00:00:00 2001
From: Rasmus Ringdahl <rasmus.ringdahl@liu.se>
Date: Thu, 6 Feb 2025 09:47:14 +0100
Subject: [PATCH] refactor: streamline example 1

---
 1_single_core_job/README.md                   | 34 ---------
 1_single_core_job/single_core_job.sh          | 12 ----
 1_single_core_job/single_core_task.py         | 37 ----------
 1_single_job_step/README.md                   | 49 +++++++++++++
 1_single_job_step/parallel_task.py            | 69 +++++++++++++++++++
 1_single_job_step/sequential_task.py          | 63 +++++++++++++++++
 1_single_job_step/single_job_step_parallel.sh | 13 ++++
 .../single_job_step_sequential.sh             | 13 ++++
 8 files changed, 207 insertions(+), 83 deletions(-)
 delete mode 100644 1_single_core_job/README.md
 delete mode 100644 1_single_core_job/single_core_job.sh
 delete mode 100644 1_single_core_job/single_core_task.py
 create mode 100644 1_single_job_step/README.md
 create mode 100644 1_single_job_step/parallel_task.py
 create mode 100644 1_single_job_step/sequential_task.py
 create mode 100644 1_single_job_step/single_job_step_parallel.sh
 create mode 100644 1_single_job_step/single_job_step_sequential.sh

diff --git a/1_single_core_job/README.md b/1_single_core_job/README.md
deleted file mode 100644
index acda651..0000000
--- a/1_single_core_job/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# Single core jobs
-A single core job is a job with only a single thread. This type of job is used when it is hard or impossible to make use of multiple cores/threads. A simple example could be a data parser that reads a file and transforms it into a more suitable format.
-
-## How to run
-To run the example do the following steps:
-1. Log in to Lundgren
-2. Change directory to the example code
-3. Run `sbatch single_core_job.sh`
-4. Check queue status by running `squeue`
-5. When the job is completed check the file _single_core_job.log_
-
-## Detailed description of the example
-The batch script is the main file for the job allocation and preparation. Inside the python script a few environmental variables are fetched and printed out.
-
-### The batch script
-The batch script, _single_core_job.sh_, contains three sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into environment so it is accessible and lastly the a job step is performed.
-
-The input arguments are defined with a comment beginning with SBATCH followed by the argument key and value. For easier readablility the -- method is used.
-
-- __job-name:__ The name of the job is set to _demo_single_core_
-- __time:__ The requeted time is set to 1 minute, _00:01:00_
-- __ntasks:__ The number of tasks to be performed in this job is set to _1_.
-- __cpus-per-task:__ The requested number of cores per task is set to _1_
-- __mem:__ The requested memory is set to _50 MB_
-- __output:__ The standard output should be sent to the file _single_core_job.log_
-
-Python needs to be loaded into the environment in order to be accessible this is done in the next step with the __module__ command.
-
-The job step with the single task is allocated and performed with the __srun__ command.
-
-#### The python script
-The python script represents the taskt to be done. In this case the task is to print out some environment variables that are set by Slurm.
-
-The environment variable __JOB_ID__ can be used to create temporary files and folders. In this example it creates a file named _<JOB_ID>_.txt and writes the job name into it.
\ No newline at end of file
diff --git a/1_single_core_job/single_core_job.sh b/1_single_core_job/single_core_job.sh
deleted file mode 100644
index e0bd298..0000000
--- a/1_single_core_job/single_core_job.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#! /bin/bash
-#SBATCH --job-name=demo_single_core
-#SBATCH --time=00:01:00
-#SBATCH --ntasks=1
-#SBATCH --cpus-per-task=1
-#SBATCH --mem=50MB
-#SBATCH --output=single_core_job.log
-
-# Loading Python into the environment
-module load python/anaconda3-2024.02-3.11.7
-
-srun python single_core_task.py
\ No newline at end of file
diff --git a/1_single_core_job/single_core_task.py b/1_single_core_job/single_core_task.py
deleted file mode 100644
index 9e8e26d..0000000
--- a/1_single_core_job/single_core_task.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from datetime import datetime
-
-import logging
-import os
-import time
-
-logger = logging.getLogger(__name__)
-
-def main():
-    # Read environment variables.
-    JOB_NAME = os.environ.get('SLURM_JOB_NAME','Unknown')
-    JOB_ID = os.environ.get('SLURM_JOB_ID','Unknown')
-    NUMBER_OF_CORES = os.environ.get('SLURM_CPUS_PER_TASK','Unknown')
-    MAXIMUM_MEMORY = os.environ.get('SLURM_MEM_PER_NODE','Unknown')
-
-    # Sleeping until next minute.
-    # This represents the calculations
-    current_time = datetime.now()
-    sleep_time = 60 - current_time.second
-    logger.info('%s - Sleeping for %d seconds.',current_time.strftime('%Y-%m-%d %H:%M:%S'), sleep_time)
-    time.sleep(sleep_time)
-
-    # Printing some things to standard output.
-    logger.info('\nJob ID:\t\t\t%s\nJob name:\t\t%s\nAllocated cores:\t%s\nAllocated memory:\t%s',
-        JOB_ID, JOB_NAME, NUMBER_OF_CORES,MAXIMUM_MEMORY)
-
-    # Writing some output to a file based on the Slurm job id.
-    output_file = '{}.txt'.format(JOB_ID)
-    with open(output_file,'w') as file:
-        file.write('This file was created by the job {} with id {}\n'.format
-                   (JOB_NAME, JOB_ID))
-        
-    logger.info('Job completed.')
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.INFO)
-    main()
diff --git a/1_single_job_step/README.md b/1_single_job_step/README.md
new file mode 100644
index 0000000..8261b61
--- /dev/null
+++ b/1_single_job_step/README.md
@@ -0,0 +1,49 @@
+# Single job step
+In Slurm, job steps are a way to launch distinct parallel (most commonly) and/or sequential tasks from within a single job script. Job steps are executed with the Slurm command __srun__.
+
+This is an example where Slurm is instructed to run a single job step. A single job step is fast to write and simple to use.
+
+This folder contains one batch script that runs a sequential program and one batch script that runs a parallel program. The batch scripts themselves, which define the job steps, are similar with only slight differences in the settings.
+
+## How to run
+To run the example do the following steps:
+1. Log in to Lundgren
+2. Change directory to the example code
+3. Run `sbatch single_job_step_sequential.sh` or `sbatch single_job_step_parallel.sh`
+4. Check queue status by running `squeue`
+5. When the job is completed check the file _sequential_single_job_step.log_ or _parallel_single_job_step.log_ for the program log.
+
+_If you run the parallel example, try changing the batch file to run on 1 or 4 cores._
+
+## Detailed description of the example
+The batch script is the main file for the job allocation and preparation. Inside the Python script a few environment variables set by Slurm are fetched and the actual task is performed.
+
+### The batch script
+The batch script, _single_job_step_sequential.sh_ / _single_job_step_parallel.sh_, contains three sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into the environment so it is accessible, and lastly a job step is performed.
+
+The input arguments are defined with a comment beginning with `#SBATCH` followed by the argument key and value, for example `#SBATCH --job-name=parallel_single_job_step`. For easier readability the long form of the arguments (`--key=value`) is used.
+
+- __job-name:__ The name of the job
+- __time:__ The requested time
+- __ntasks:__ The number of tasks to be performed in this job
+- __cpus-per-task:__ The requested number of CPUs per task
+- __mem-per-cpu:__ The requested memory per allocated CPU
+- __output:__ File name for the standard output
+
+Python needs to be loaded into the environment in order to be accessible. This is done in the second section with the __module__ command.
+
+The single job step is allocated and performed with the __srun__ command.
+
+#### The python script
+The Python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation and afterwards write the result to an output file.
+
+- The environment variable __SLURM_JOB_ID__ can be used to create temporary files and folders, as sketched below.
+- The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of CPUs when running in parallel.
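+
+A minimal sketch of how these variables can be read in Python (the complete versions are in _sequential_task.py_ and _parallel_task.py_; the fallback values below are only illustrative):
+
+```python
+import os
+
+# Slurm sets these variables for every job; the fallbacks are placeholders
+# for when the script is started outside of Slurm.
+job_id = os.environ.get('SLURM_JOB_ID', 'Unknown')
+cpus_per_task = int(os.environ.get('SLURM_CPUS_PER_TASK', '1'))
+
+# The job id gives a unique name for temporary files.
+scratch_file = '{}.txt'.format(job_id)
+```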
+
+### How to set the number of cores in different programming languages and software
+Most programming languages and software packages try to make use of all cores that are available. This can lead to oversubscription of the resources. On a shared resource one must match the maximum used resources with the allocated ones. This section gives a reference for how to do it in commonly used software.
+
+- __CPLEX:__ Use the parameter _global thread count_. Read more in the [documentation](https://www.ibm.com/docs/en/icos/22.1.2?topic=parameters-global-thread-count)
+- __Gurobi:__ Use the configuration parameter _ThreadLimit_. Read more in the [documentation](https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#threadlimit)
+- __MATLAB:__ Create an instance of the parpool object with the _poolsize_ set to the number of cores and use the pool when running in parallel. Read more in the [documentation](https://se.mathworks.com/help/parallel-computing/parpool.html)
+- __Python:__ If the multiprocessing package is used, create an instance of the Pool class with the _processes_ argument set to the number of cores and use the pool when running in parallel, as in the sketch below. Read more in the [documentation](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool)
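+
+For the Python case, a minimal sketch of matching the pool size to the Slurm allocation (_parallel_task.py_ in this example uses the same approach; the `work` function and the fallback value are only placeholders):
+
+```python
+import os
+from multiprocessing import Pool
+
+def work(x):
+    # Placeholder for a real calculation.
+    return x * x
+
+if __name__ == '__main__':
+    # Match the pool size to the number of CPUs Slurm allocated to the task.
+    allocated_cpus = int(os.environ.get('SLURM_CPUS_PER_TASK', '1'))
+    with Pool(processes=allocated_cpus) as pool:
+        results = pool.map(work, range(10))
+    print(results)
+```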
\ No newline at end of file
diff --git a/1_single_job_step/parallel_task.py b/1_single_job_step/parallel_task.py
new file mode 100644
index 0000000..e579a6f
--- /dev/null
+++ b/1_single_job_step/parallel_task.py
@@ -0,0 +1,69 @@
+from datetime import datetime
+from multiprocessing import Pool
+
+import json
+import logging
+import os
+import sys
+import time
+
+logger = logging.getLogger(__name__)
+
+def sleep(input) -> int:
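+    # Simulates a calculation: input is a tuple (task id, seconds to sleep).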
+    time.sleep(input[1])
+    logger.info('Task %d done.',input[0])
+
+    return input[1]
+
+def main(input_file: str, output_file: str):
+    # Read environment variables.
+    JOB_NAME = os.environ.get('SLURM_JOB_NAME','Unknown')
+    JOB_ID = os.environ.get('SLURM_JOB_ID','Unknown')
+    NUMBER_OF_CPUS = os.environ.get('SLURM_CPUS_PER_TASK','Unknown')
+    if NUMBER_OF_CPUS == 'Unknown':
+        logger.error('Unknown number of CPUs, exiting.')
+        return
+
+    NUMBER_OF_CPUS = int(NUMBER_OF_CPUS)
+    logger.info('**** Output for job %s (%s) ****', JOB_NAME, JOB_ID)
+    logger.info('Running program with %d CPUs.', NUMBER_OF_CPUS)
+
+    # Reading configuration file and create a list of tasks
+    # This represents the reading of parameters and calculations
+    logger.info('Reading configuration from %s.',input_file)
+    with open(input_file, 'r') as file:
+        data = json.load(file)
+    
+    tasks = []
+    total_time = 0
+    for i, sleep_time in enumerate(data['sleep']):
+        tasks.append((i, sleep_time))
+        total_time = total_time + sleep_time
+
+    # Creating a multiprocessing pool and submitting the tasks to the workers.
+    # The context manager closes the pool when the work is done.
+    tic = datetime.now()
+    logger.info('Submitting tasks to pool.')
+    with Pool(processes=NUMBER_OF_CPUS) as pool:
+        results = pool.map(sleep, tasks)
+    toc = datetime.now()
+
+    logger.info('All tasks are done, took %d seconds, compared to %d seconds with single thread.',
+        (toc-tic).seconds, total_time)
+    
+    logger.info('Writing result to %s', output_file)
+    with open(output_file, 'w') as file:
+        file.write('time\n')
+        for result in results:
+            file.write('{}\n'.format(result))
+
+    logger.info('Done.')
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
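+    # The input and output file paths are passed on the command line by the batch script.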
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+    main(input_file, output_file)
+    sys.exit(0)
diff --git a/1_single_job_step/sequential_task.py b/1_single_job_step/sequential_task.py
new file mode 100644
index 0000000..0862c07
--- /dev/null
+++ b/1_single_job_step/sequential_task.py
@@ -0,0 +1,63 @@
+from datetime import datetime
+
+import json
+import logging
+import os
+import sys
+import time
+
+logger = logging.getLogger(__name__)
+
+def sleep(input) -> int:
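+    # Simulates a calculation: input is a tuple (task id, seconds to sleep).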
+    time.sleep(input[1])
+    logger.info('Task %d done.',input[0])
+
+    return input[1]
+
+def main(input_file: str, output_file: str):
+    # Read environment variables.
+    JOB_NAME = os.environ.get('SLURM_JOB_NAME','Unknown')
+    JOB_ID = os.environ.get('SLURM_JOB_ID','Unknown')
+
+    logger.info('**** Output for job %s (%s) ****', JOB_NAME, JOB_ID)
+    logger.info('Running program sequentially.')
+
+    # Reading configuration file and create a list of tasks
+    # This represents the reading of parameters and calculations
+    logger.info('Reading configuration from %s.',input_file)
+    with open(input_file, 'r') as file:
+        data = json.load(file)
+    
+    tasks = []
+    results = []
+    total_time = 0
+    for i, sleep_time in enumerate(data['sleep']):
+        tasks.append((i, sleep_time))
+        total_time = total_time + sleep_time
+
+    # Running the tasks in sequence.
+    tic = datetime.now()
+    logger.info('Running tasks sequentially.')
+    for task in tasks:
+        results.append(sleep(task))
+    toc = datetime.now()
+
+    logger.info('All tasks are done, took %d seconds, compared to %d seconds with single thread.',
+        (toc-tic).seconds, total_time)
+    
+    logger.info('Writing result to %s', output_file)
+    with open(output_file, 'w') as file:
+        file.write('time\n')
+        for result in results:
+            file.write('{}\n'.format(result))
+
+    logger.info('Done.')
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
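+    # The input and output file paths are passed on the command line by the batch script.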
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+    main(input_file, output_file)
+    sys.exit(0)
diff --git a/1_single_job_step/single_job_step_parallel.sh b/1_single_job_step/single_job_step_parallel.sh
new file mode 100644
index 0000000..70be4b0
--- /dev/null
+++ b/1_single_job_step/single_job_step_parallel.sh
@@ -0,0 +1,13 @@
+#! /bin/bash
+#SBATCH --job-name=parallel_single_job_step
+#SBATCH --time=00:02:00
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=2
+#SBATCH --mem-per-cpu=50MB
+#SBATCH --output=parallel_single_job_step.log
+
+# Loading Python into the environment
+module load python/anaconda3-2024.02-3.11.7
+
+# Start job step
+srun python parallel_task.py ../data/data_1.txt output_parallel.csv
\ No newline at end of file
diff --git a/1_single_job_step/single_job_step_sequential.sh b/1_single_job_step/single_job_step_sequential.sh
new file mode 100644
index 0000000..133033d
--- /dev/null
+++ b/1_single_job_step/single_job_step_sequential.sh
@@ -0,0 +1,13 @@
+#! /bin/bash
+#SBATCH --job-name=sequential_single_job_step
+#SBATCH --time=00:02:00
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem-per-cpu=50MB
+#SBATCH --output=sequential_single_job_step.log
+
+# Loading Python into the environment
+module load python/anaconda3-2024.02-3.11.7
+
+# Start job step
+srun python sequential_task.py ../data/data_1.txt output_sequential.csv
\ No newline at end of file
-- 
GitLab