Add new file

ccb1f945 · Chinthak Murali · d3363c74 · ccb1f945
Commit ccb1f945 authored 7 months ago by Chinthak Murali
--- a/Week 10: OpenMP / MPI Programming and Benchmarking
+++ b/Week 10: OpenMP / MPI Programming and Benchmarking
+Mentor: Peng
+
+Training Topics
+Prerequisites:
+
+1. Understand the concepts of OpenMP and MPI, and of multi-threading vs. multi-processing
+
+OpenMP: 
+
+Use OpenMP when working on a shared memory system and the parallelism can be easily expressed using threads. 
+It is often simpler to implement and can be very efficient for multi-core processors.
+
+MPI: 
+
+Use MPI for large-scale distributed computing across multiple nodes. 
+It is essential for applications that require high scalability and involve complex communication patterns among processes.
+
+
+
+Multiple Threads
+
+Threads are the smallest unit of execution within a process. 
+Multiple threads within a single process share the same memory space and can communicate more efficiently than separate processes. 
+Each thread has its own stack but shares code, data, and file descriptors with other threads in the same process.
+
+Multiple Processes
+
+Processes are independent execution units that have their own memory space. 
+Multiple processes can run concurrently on different CPUs or cores and communicate via inter-process communication (IPC) mechanisms like pipes, sockets, shared memory, or message passing.
+
+
+2. Create programs to run an OpenMP hello world and a parallel-loop example
+
+
+
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+    /* Each thread of the team executes the block below once and
+     * reports its own thread number. */
+    #pragma omp parallel
+    {
+        printf("Hello World from thread %d\n", omp_get_thread_num());
+    }
+
+    return 0;
+}
+
+
+
+gcc -fopenmp hello_openmp.c -o hello_openmp
+./hello_openmp
+
+
+This program demonstrates how to use OpenMP to parallelize a loop that squares each element of an array.
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 100
+
+int main() {
+    int values[N];
+    int k;  /* declared up front so the file also builds without -std=c99 */
+
+    /* Fill the array with 0 .. N-1. */
+    for (k = 0; k < N; k++) {
+        values[k] = k;
+    }
+
+    /* Square every element; the iterations are independent, so OpenMP
+     * may hand them out to the threads of the team. */
+    #pragma omp parallel for
+    for (k = 0; k < N; k++) {
+        values[k] *= values[k];
+    }
+
+    /* Report the squared values in index order. */
+    for (k = 0; k < N; k++) {
+        printf("array[%d] = %d\n", k, values[k]);
+    }
+
+    return 0;
+}
+
+
+gcc -fopenmp parallel_loop_openmp.c -o parallel_loop_openmp
+./parallel_loop_openmp
+
+
+
+3. Create programs to run an MPI hello world and a parallel-loop example
+
+
+module load mpich/ge/gcc/64/3.2rc2
+
+
+#include <mpi.h>
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+    int rank;    /* this process's rank within MPI_COMM_WORLD */
+    int nprocs;  /* number of processes in MPI_COMM_WORLD */
+
+    /* Start up MPI, then query this process's rank and the communicator size. */
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    printf("Hello World from process %d of %d\n", rank, nprocs);
+
+    /* Shut MPI down before exiting. */
+    MPI_Finalize();
+    return 0;
+}
+
+
+mpicc hello_mpi.c -o hello_mpi  # Compile the program
+mpirun -np 4 ./hello_mpi        # Run the program with 4 processes
+
+
+#include <mpi.h>
+#include <stdio.h>
+
+#define N 100
+
+/*
+ * Squares the integers 0 .. N-1 in parallel: rank 0 fills the array, the
+ * elements are distributed over all ranks, each rank squares its share, and
+ * rank 0 collects and prints the result.
+ *
+ * The split uses MPI_Scatterv/MPI_Gatherv so that every element is processed
+ * even when N is not divisible by the number of processes (a plain
+ * N / world_size split silently drops the trailing N % world_size elements).
+ */
+int main(int argc, char** argv) {
+    MPI_Init(&argc, &argv);  // Initialize the MPI environment
+
+    int world_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size);  // Get the number of processes
+
+    int world_rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);  // Get the rank of the process
+
+    int array[N];
+
+    // Per-rank element counts and starting offsets into array.
+    // The first (N % world_size) ranks each take one extra element.
+    int counts[world_size];
+    int displs[world_size];
+    int base = N / world_size;
+    int extra = N % world_size;
+    int offset = 0;
+    for (int r = 0; r < world_size; r++) {
+        counts[r] = base + (r < extra ? 1 : 0);
+        displs[r] = offset;
+        offset += counts[r];
+    }
+    int local_N = counts[world_rank];  // Number of elements owned by this process
+
+    // Initialize the array in the root process (rank 0)
+    if (world_rank == 0) {
+        for (int i = 0; i < N; i++) {
+            array[i] = i;
+        }
+    }
+
+    // Scatter the array to all processes.
+    // A rank may own zero elements when world_size > N; keep the VLA length >= 1.
+    int local_array[local_N > 0 ? local_N : 1];
+    MPI_Scatterv(array, counts, displs, MPI_INT, local_array, local_N, MPI_INT, 0, MPI_COMM_WORLD);
+
+    // Perform the computation locally
+    for (int i = 0; i < local_N; i++) {
+        local_array[i] = local_array[i] * local_array[i];
+    }
+
+    // Gather the results back to the root process, at the same offsets they came from
+    MPI_Gatherv(local_array, local_N, MPI_INT, array, counts, displs, MPI_INT, 0, MPI_COMM_WORLD);
+
+    // Print the results in the root process
+    if (world_rank == 0) {
+        for (int i = 0; i < N; i++) {
+            printf("array[%d] = %d\n", i, array[i]);
+        }
+    }
+
+    MPI_Finalize();  // Finalize the MPI environment
+    return 0;
+}
+
+
+
+mpicc -std=c99 parallel_loop_mpi.c -o parallel_loop_mpi
+mpirun -np 4 ./parallel_loop_mpi
+
+
+
+4. Write serial and MPI versions of Blowfish
+
+
+blowfish_serial.c
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <openssl/blowfish.h>
+
+#define DATA_SIZE 100000000 // Larger dataset to exaggerate difference
+
+/*
+ * Encrypts data_len bytes from data into encrypted, one 8-byte Blowfish
+ * block at a time (ECB mode).
+ *
+ * If data_len is not a multiple of 8, the final partial block is zero-padded
+ * in a temporary buffer before encryption, so data is never read past
+ * data_len. The output is always a whole number of blocks: encrypted must
+ * have room for data_len rounded up to the next multiple of 8.
+ */
+void blowfish_encrypt(BF_KEY *key, uint8_t *data, size_t data_len, uint8_t *encrypted) {
+    size_t full_len = data_len - (data_len % 8);  // bytes covered by whole blocks
+    size_t i;
+    for (i = 0; i < full_len; i += 8) {
+        BF_ecb_encrypt(data + i, encrypted + i, key, BF_ENCRYPT);
+    }
+    if (full_len < data_len) {
+        uint8_t block[8] = {0};  // zero padding for the trailing partial block
+        memcpy(block, data + full_len, data_len - full_len);
+        BF_ecb_encrypt(block, encrypted + full_len, key, BF_ENCRYPT);
+    }
+}
+
+/*
+ * Decrypts ciphertext from data into decrypted, one 8-byte Blowfish block at
+ * a time (ECB mode), producing exactly data_len bytes of plaintext.
+ *
+ * data must hold data_len rounded up to the next multiple of 8 bytes of
+ * ciphertext. If data_len is not a multiple of 8, the final block is
+ * decrypted into a temporary buffer and only the remaining data_len % 8
+ * bytes are copied out, so decrypted is never written past data_len.
+ */
+void blowfish_decrypt(BF_KEY *key, uint8_t *data, size_t data_len, uint8_t *decrypted) {
+    size_t full_len = data_len - (data_len % 8);  // bytes covered by whole blocks
+    size_t i;
+    for (i = 0; i < full_len; i += 8) {
+        BF_ecb_encrypt(data + i, decrypted + i, key, BF_DECRYPT);
+    }
+    if (full_len < data_len) {
+        uint8_t block[8];
+        BF_ecb_encrypt(data + full_len, block, key, BF_DECRYPT);
+        memcpy(decrypted + full_len, block, data_len - full_len);
+    }
+}
+
+/*
+ * Serial Blowfish benchmark: fills a DATA_SIZE-byte buffer with a repeating
+ * A-Z pattern, encrypts and decrypts it with a 16-byte key, prints the time
+ * of each phase as measured by clock(), and checks that the round trip
+ * reproduces the input.
+ *
+ * Returns EXIT_FAILURE if the work buffers cannot be allocated, 0 otherwise.
+ */
+int main() {
+    BF_KEY key;
+    uint8_t *data = malloc(DATA_SIZE);
+    uint8_t *encrypted = malloc(DATA_SIZE + 8); // Adding padding space
+    uint8_t *decrypted = malloc(DATA_SIZE);
+
+    // Three buffers of about DATA_SIZE bytes each: stop cleanly if any allocation failed
+    if (data == NULL || encrypted == NULL || decrypted == NULL) {
+        fprintf(stderr, "Failed to allocate %d-byte work buffers\n", DATA_SIZE);
+        free(data);
+        free(encrypted);
+        free(decrypted);
+        return EXIT_FAILURE;
+    }
+
+    // Initialize data
+    size_t i;
+    for (i = 0; i < DATA_SIZE; ++i) {
+        data[i] = 'A' + (i % 26);
+    }
+
+    // rand() is never seeded here, so the key is the same on every run
+    uint8_t key_data[16];
+    for (i = 0; i < 16; ++i) {
+        key_data[i] = rand() % 256;
+    }
+
+    BF_set_key(&key, 16, key_data);
+
+    clock_t start, end;
+    double cpu_time_used;
+
+    start = clock();
+    blowfish_encrypt(&key, data, DATA_SIZE, encrypted);
+    end = clock();
+    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+    printf("Encryption time (serial): %f seconds\n", cpu_time_used);
+
+    start = clock();
+    blowfish_decrypt(&key, encrypted, DATA_SIZE, decrypted);
+    end = clock();
+    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+    printf("Decryption time (serial): %f seconds\n", cpu_time_used);
+
+    // Validate the decrypted data
+    if (memcmp(data, decrypted, DATA_SIZE) != 0) {
+        printf("Decrypted data does not match original data!\n");
+    } else {
+        printf("Decrypted data matches original data!\n");
+    }
+
+    free(data);
+    free(encrypted);
+    free(decrypted);
+
+    return 0;
+}
+
+
+
+
+gcc -std=c99 blowfish_serial.c -o blowfish_serial -lcrypto
+./blowfish_serial
+
+
+
+
+
+blowfish_mpi.c
+
+#include <mpi.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <openssl/blowfish.h>
+
+#define DATA_SIZE 100000000 // Larger dataset to exaggerate difference
+
+/*
+ * Encrypts data_len bytes from data into encrypted, one 8-byte Blowfish
+ * block at a time (ECB mode).
+ *
+ * If data_len is not a multiple of 8, the final partial block is zero-padded
+ * in a temporary buffer before encryption, so data is never read past
+ * data_len. The output is always a whole number of blocks: encrypted must
+ * have room for data_len rounded up to the next multiple of 8.
+ */
+void blowfish_encrypt(BF_KEY *key, uint8_t *data, size_t data_len, uint8_t *encrypted) {
+    size_t full_len = data_len - (data_len % 8);  // bytes covered by whole blocks
+    size_t i;
+    for (i = 0; i < full_len; i += 8) {
+        BF_ecb_encrypt(data + i, encrypted + i, key, BF_ENCRYPT);
+    }
+    if (full_len < data_len) {
+        uint8_t block[8] = {0};  // zero padding for the trailing partial block
+        memcpy(block, data + full_len, data_len - full_len);
+        BF_ecb_encrypt(block, encrypted + full_len, key, BF_ENCRYPT);
+    }
+}
+
+/*
+ * Decrypts ciphertext from data into decrypted, one 8-byte Blowfish block at
+ * a time (ECB mode), producing exactly data_len bytes of plaintext.
+ *
+ * data must hold data_len rounded up to the next multiple of 8 bytes of
+ * ciphertext. If data_len is not a multiple of 8, the final block is
+ * decrypted into a temporary buffer and only the remaining data_len % 8
+ * bytes are copied out, so decrypted is never written past data_len.
+ */
+void blowfish_decrypt(BF_KEY *key, uint8_t *data, size_t data_len, uint8_t *decrypted) {
+    size_t full_len = data_len - (data_len % 8);  // bytes covered by whole blocks
+    size_t i;
+    for (i = 0; i < full_len; i += 8) {
+        BF_ecb_encrypt(data + i, decrypted + i, key, BF_DECRYPT);
+    }
+    if (full_len < data_len) {
+        uint8_t block[8];
+        BF_ecb_encrypt(data + full_len, block, key, BF_DECRYPT);
+        memcpy(decrypted + full_len, block, data_len - full_len);
+    }
+}
+
+/*
+ * MPI Blowfish benchmark.
+ *
+ * Rank 0 fills a DATA_SIZE-byte buffer with a repeating A-Z pattern and
+ * scatters it; every rank encrypts its share and prints its own encryption
+ * time; rank 0 gathers the ciphertext, decrypts all of it by itself, prints
+ * that time and checks the round trip.
+ *
+ * The data is distributed in whole 8-byte Blowfish blocks with
+ * MPI_Scatterv/MPI_Gatherv. Splitting by DATA_SIZE / size bytes breaks for
+ * process counts such as 3 or 6, where the per-rank share is not a multiple
+ * of 8: blocks would straddle rank boundaries and the gathered ciphertext
+ * could not be decrypted back to the original.
+ */
+int main(int argc, char** argv) {
+    MPI_Init(&argc, &argv);
+
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // The distribution below hands out whole blocks only
+    if (DATA_SIZE % 8 != 0) {
+        if (rank == 0) {
+            fprintf(stderr, "DATA_SIZE must be a multiple of the 8-byte Blowfish block\n");
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    // Generate the key once on rank 0 and broadcast it, so that all ranks
+    // encrypt with exactly the key rank 0 later decrypts with
+    BF_KEY key;
+    uint8_t key_data[16];
+    size_t i;
+    if (rank == 0) {
+        for (i = 0; i < 16; ++i) {
+            key_data[i] = rand() % 256;
+        }
+    }
+    MPI_Bcast(key_data, 16, MPI_BYTE, 0, MPI_COMM_WORLD);
+    BF_set_key(&key, 16, key_data);
+
+    // Per-rank byte counts and offsets; the first (total_blocks % size)
+    // ranks each take one extra block
+    int *counts = malloc((size_t)size * sizeof *counts);
+    int *displs = malloc((size_t)size * sizeof *displs);
+    if (counts == NULL || displs == NULL) {
+        fprintf(stderr, "Process %d: failed to allocate distribution tables\n", rank);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    size_t total_blocks = DATA_SIZE / 8;
+    size_t base_blocks = total_blocks / (size_t)size;
+    size_t extra_blocks = total_blocks % (size_t)size;
+    size_t offset = 0;
+    int r;
+    for (r = 0; r < size; ++r) {
+        size_t blocks = base_blocks + ((size_t)r < extra_blocks ? 1 : 0);
+        counts[r] = (int)(blocks * 8);
+        displs[r] = (int)offset;
+        offset += blocks * 8;
+    }
+
+    size_t chunk_size = (size_t)counts[rank];
+    size_t alloc_size = chunk_size > 0 ? chunk_size : 1;  // avoid malloc(0)
+    uint8_t *local_data = malloc(alloc_size);
+    uint8_t *local_encrypted = malloc(alloc_size);
+
+    // The full-size buffers exist on rank 0 only; other ranks pass NULL to
+    // the collectives, which ignore the root-side buffer arguments there
+    uint8_t *data = NULL;
+    uint8_t *encrypted = NULL;
+    uint8_t *decrypted = NULL;
+    if (rank == 0) {
+        data = malloc(DATA_SIZE);
+        encrypted = malloc(DATA_SIZE);
+        decrypted = malloc(DATA_SIZE);
+    }
+
+    if (local_data == NULL || local_encrypted == NULL ||
+        (rank == 0 && (data == NULL || encrypted == NULL || decrypted == NULL))) {
+        fprintf(stderr, "Process %d: failed to allocate work buffers\n", rank);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    if (rank == 0) {
+        // Initialize data
+        for (i = 0; i < DATA_SIZE; ++i) {
+            data[i] = 'A' + (i % 26);
+        }
+    }
+
+    double start, end;
+
+    // Scatter data to all processes
+    MPI_Scatterv(data, counts, displs, MPI_BYTE, local_data, counts[rank], MPI_BYTE, 0, MPI_COMM_WORLD);
+
+    start = MPI_Wtime();
+    blowfish_encrypt(&key, local_data, chunk_size, local_encrypted);
+    end = MPI_Wtime();
+    printf("Process %d encryption time: %f seconds\n", rank, end - start);
+
+    // Gather encrypted data from all processes (not included in any timing)
+    MPI_Gatherv(local_encrypted, counts[rank], MPI_BYTE, encrypted, counts, displs, MPI_BYTE, 0, MPI_COMM_WORLD);
+
+    if (rank == 0) {
+        // Decryption runs on rank 0 alone over the whole buffer, so this
+        // time is comparable to the serial program despite the label
+        start = MPI_Wtime();
+        blowfish_decrypt(&key, encrypted, DATA_SIZE, decrypted);
+        end = MPI_Wtime();
+        printf("Decryption time (parallel): %f seconds\n", end - start);
+
+        // Validate the decrypted data
+        if (memcmp(data, decrypted, DATA_SIZE) != 0) {
+            printf("Decrypted data does not match original data!\n");
+        } else {
+            printf("Decrypted data matches original data!\n");
+        }
+    }
+
+    free(data);
+    free(encrypted);
+    free(decrypted);
+    free(local_data);
+    free(local_encrypted);
+    free(counts);
+    free(displs);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
+
+
+
+mpicc blowfish_mpi.c -o blowfish_mpi -lcrypto
+mpirun -np 4 ./blowfish_mpi
+
+
+
+./blowfish_serial
+Encryption time (serial): 0.660000 seconds
+Decryption time (serial): 0.640000 seconds
+
+mpirun -np 4 ./blowfish_mpi
+Process 1 encryption time: 0.168639 seconds
+Process 3 encryption time: 0.174601 seconds
+Process 0 encryption time: 0.174799 seconds
+Process 2 encryption time: 0.174592 seconds
+Decryption time (parallel): 0.700148 seconds
+
+
+
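+The encryption time of each process in the 4-process MPI run is roughly 4 times shorter than in the serial job (about 0.17 s vs. 0.66 s), because each process encrypts only a quarter of the data.
+The decryption time is about the same as in the serial job because blowfish_mpi.c decrypts the entire gathered buffer on rank 0 only, so decryption is not parallelized; the gather step itself is outside the timed region.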
+
+using SLURM:
+
+-N 1
+-p 128GB
+
+Encryption time (serial): 0.960000 seconds
+Decryption time (serial): 0.960000 seconds
+Decrypted data matches original data!
+
+
+-N 2
+-p 128GB
+
+Process 1 encryption time: 0.476727 seconds
+Process 0 encryption time: 0.476585 seconds
+Decryption time (parallel): 0.954134 seconds
+Decrypted data matches original data!
+
+
+-N 3
+-p 128GB
+
+Process 1 encryption time: 0.326551 seconds
+Process 2 encryption time: 0.323678 seconds
+Process 0 encryption time: 0.323797 seconds
+Decryption time (parallel): 0.970143 seconds
+
+
+
+-N 4
+-p 128GB 
+
+Process 1 encryption time: 0.243206 seconds
+Process 3 encryption time: 0.243105 seconds
+Process 0 encryption time: 0.240108 seconds
+Process 2 encryption time: 0.240205 seconds
+Decryption time (parallel): 0.961980 seconds
+Decrypted data matches original data!
+
+
+-N 5
+-p 128GB
+
+Process 3 encryption time: 0.193014 seconds	
+Process 4 encryption time: 0.207064 seconds
+Process 0 encryption time: 0.196599 seconds
+Process 2 encryption time: 0.196809 seconds
+Process 1 encryption time: 0.205737 seconds
+Decryption time (parallel): 0.983177 seconds
+Decrypted data matches original data!
+
+
+-N 6
+-p 128GB
+
+Process 5 encryption time: 0.164433 seconds
+Process 4 encryption time: 0.163639 seconds
+Process 1 encryption time: 0.161994 seconds
+Process 3 encryption time: 0.163783 seconds
+Process 2 encryption time: 0.162038 seconds
+Process 0 encryption time: 0.163603 seconds
+Decryption time (parallel): 1.002695 seconds
+
+
+
+
+
+
+Key Concepts regarding Process, Threads, Multiprocessing, Multithreading, MPI
+
+Process:
+
+A process is an instance of a program that is being executed. It contains the program code and its current activity.
+Each process has its own separate memory space. This means that processes do not share memory with each other. They have their own address space, stack, and heap.
+Processes are isolated from each other. A crash in one process does not affect other processes. This makes them more robust for executing separate tasks.
+Since processes do not share memory, they need to use IPC mechanisms like pipes, sockets, shared memory, or message passing to communicate with each other.
+Creating and managing processes has more overhead compared to threads because each process requires its own memory space and system resources.
+Examples: Web browsers (where each tab might be a separate process), operating system services, and servers
+
+
+Thread:
+
+A thread is the smallest unit of execution within a process. Multiple threads can exist within the same process and share the same memory space.
+Each thread has its own stack, but shares the code, data, and file descriptors of its process with the other threads in that process.
+Since threads share the same memory space, they can communicate with each other more easily and efficiently than processes. This allows for faster context switching and data sharing.
+Creating and managing threads has less overhead compared to processes because threads share resources of the parent process.
+Threads are not isolated from each other. A crash in one thread can potentially bring down the entire process, affecting all other threads within that process.
+Examples: Multithreaded applications like web servers (where each thread handles a client request), GUI applications (where one thread handles user input while another performs background tasks), and parallel algorithms
+
+
+
+Processes cannot run in true parallelism on a single-core system. On a single-core system, only one process can execute at a time. However, multiple processes can achieve concurrency through context switching, where the operating system rapidly switches between processes, giving the illusion that they are running simultaneously.
+Threads within the same process share the same memory space, which is more efficient for communication but requires careful synchronization to avoid race conditions.
+Similar to processes, there are operating system-imposed limits on the number of threads per process and the total memory that can be allocated.
+
+Parallelism:
+
+Definition: Performing multiple operations simultaneously.
+Requirement: Requires multiple CPU cores.
+Execution: True parallelism is only possible on multi-core systems, where different processes or threads can be executed on different cores at the same time.
+
+Concurrency:
+
+Definition: Managing multiple tasks so that they all make progress over overlapping time periods, without necessarily executing at the same instant.
+Requirement: Can be achieved on single-core or multi-core systems.
+Execution: On a single-core system, concurrency is achieved through context switching, where the CPU switches between processes or threads, giving the appearance of simultaneous execution.
+
+Single-Core System
+
+Concurrency: The operating system uses context switching to manage multiple processes. It rapidly switches between them, so each process gets a slice of CPU time.
+Parallelism: True parallelism is not possible because there is only one CPU core available to execute instructions.
+
+Multi-Core System
+Concurrency: Concurrency is still achieved through context switching, but with multiple cores, some processes can run truly in parallel.
+Parallelism: True parallelism is achieved because multiple processes or threads can be executed on different cores simultaneously.