HPC

High Performance Computing

3/26

Ring_Allreduce: implement an MPI_Allreduce using the ring algorithm.

The ring algorithm runs in two phases over a logical ring of comm_sz ranks. In the reduce-scatter phase, each rank repeatedly sends one block of its buffer to the next rank and accumulates the block it receives from the previous rank, so that after comm_sz - 1 steps every rank owns the fully reduced sum of exactly one block. In the allgather phase, those reduced blocks circulate around the ring for another comm_sz - 1 steps until every rank holds the complete result. Each rank therefore sends and receives roughly 2 * n * (comm_sz - 1) / comm_sz elements in total, nearly independent of the number of ranks, which is why this pattern tends to outperform a naive reduce-then-broadcast for large vectors.
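
As a quick illustration of the block partitioning used by the implementation below (a minimal standalone sketch; the variable names mirror those in the real code, and the concrete sizes are made up for the example):

#include <cstdio>

int main() {
    int n = 10, comm_sz = 4;                     // example sizes, for illustration only
    int chunk = (n + comm_sz - 1) / comm_sz;     // ceil(10 / 4) = 3 elements per block
    int last_chunk = n - (comm_sz - 1) * chunk;  // 10 - 3 * 3 = 1 element in the last block
    printf("chunk = %d, last_chunk = %d\n", chunk, last_chunk);
    // Note: for very small n (e.g. n = 5 with comm_sz = 4) last_chunk can become
    // zero or negative, so the routine assumes n is reasonably large relative
    // to the number of ranks.
    return 0;
}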

The code is as follows:

#include <chrono>
#include <iostream>
#include <mpi.h>
#include <cstdlib>   // atoi, rand, srand, malloc, free
#include <time.h>
#include <cstring>
#include <cmath>
#include <algorithm>

#define EPS 1e-5

namespace ch = std::chrono;

void Ring_Allreduce(void* sendbuf, void* recvbuf, int n, MPI_Comm comm, int comm_sz, int my_rank)
{
    // Split the n floats into comm_sz blocks; every block has `chunk` elements
    // except the last one, which absorbs the remainder.
    int chunk = (n + comm_sz - 1) / comm_sz;
    int last_chunk = n - (comm_sz - 1) * chunk;
    int total_bytes = n * sizeof(float);
    float* recvbuf_float = (float*)recvbuf;
    float* tmpbuf_float = (float*)malloc(total_bytes);
    memcpy(recvbuf, sendbuf, total_bytes);

    // Phase 1: reduce-scatter. After comm_sz - 1 steps, each rank owns the
    // fully reduced sum of exactly one block.
    for (int i = 0; i < comm_sz - 1; ++i) {
        MPI_Request send_req, recv_req;
        int send_block = (my_rank - i + comm_sz) % comm_sz;      // block passed to the next rank
        int recv_block = (my_rank - 1 - i + comm_sz) % comm_sz;  // block arriving from the previous rank
        int send_count = (send_block == comm_sz - 1) ? last_chunk : chunk;
        int recv_count = (recv_block == comm_sz - 1) ? last_chunk : chunk;

        MPI_Isend(recvbuf_float + send_block * chunk, send_count, MPI_FLOAT,
                  (my_rank + 1) % comm_sz, 0, comm, &send_req);
        MPI_Irecv(tmpbuf_float + recv_block * chunk, recv_count, MPI_FLOAT,
                  (my_rank - 1 + comm_sz) % comm_sz, 0, comm, &recv_req);
        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
        MPI_Wait(&recv_req, MPI_STATUS_IGNORE);

        // Accumulate the received partial sums into the local buffer.
        for (int j = 0; j < recv_count; ++j)
            recvbuf_float[recv_block * chunk + j] += tmpbuf_float[recv_block * chunk + j];
    }

    // Phase 2: allgather. The reduced blocks circulate around the ring until
    // every rank holds the complete result.
    for (int i = 0; i < comm_sz - 1; ++i) {
        MPI_Request send_req, recv_req;
        int send_block = (my_rank + 1 - i + comm_sz) % comm_sz;  // already-reduced block to forward
        int recv_block = (my_rank - i + comm_sz) % comm_sz;      // block received from the previous rank
        int send_count = (send_block == comm_sz - 1) ? last_chunk : chunk;
        int recv_count = (recv_block == comm_sz - 1) ? last_chunk : chunk;

        MPI_Isend(recvbuf_float + send_block * chunk, send_count, MPI_FLOAT,
                  (my_rank + 1) % comm_sz, 0, comm, &send_req);
        MPI_Irecv(recvbuf_float + recv_block * chunk, recv_count, MPI_FLOAT,
                  (my_rank - 1 + comm_sz) % comm_sz, 0, comm, &recv_req);
        MPI_Wait(&send_req, MPI_STATUS_IGNORE);
        MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
    }

    free(tmpbuf_float);
}

// Naive allreduce for comparison: reduce to rank 0, then broadcast.
void Naive_Allreduce(void* sendbuf, void* recvbuf, int n, MPI_Comm comm, int comm_sz, int my_rank)
{
    MPI_Reduce(sendbuf, recvbuf, n, MPI_FLOAT, MPI_SUM, 0, comm);
    MPI_Bcast(recvbuf, n, MPI_FLOAT, 0, comm);
}

int main(int argc, char *argv[])
{
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " ITER n" << std::endl;
        return 1;
    }
    int ITER = atoi(argv[1]);  // number of timed iterations
    int n = atoi(argv[2]);     // number of floats to reduce
    float* mpi_sendbuf = new float[n];
    float* mpi_recvbuf = new float[n];
    float* naive_sendbuf = new float[n];
    float* naive_recvbuf = new float[n];
    float* ring_sendbuf = new float[n];
    float* ring_recvbuf = new float[n];

    MPI_Init(nullptr, nullptr);
    int comm_sz;
    int my_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // Fill the send buffer with random floats in [0, 1); each rank gets its own seed.
    srand(time(NULL) + my_rank);
    for (int i = 0; i < n; ++i)
        mpi_sendbuf[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    memcpy(naive_sendbuf, mpi_sendbuf, n * sizeof(float));
    memcpy(ring_sendbuf, mpi_sendbuf, n * sizeof(float));

    // Warm up all three variants and check Ring_Allreduce against MPI_Allreduce.
    MPI_Allreduce(mpi_sendbuf, mpi_recvbuf, n, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
    Naive_Allreduce(naive_sendbuf, naive_recvbuf, n, MPI_COMM_WORLD, comm_sz, my_rank);
    Ring_Allreduce(ring_sendbuf, ring_recvbuf, n, MPI_COMM_WORLD, comm_sz, my_rank);
    bool correct = true;
    for (int i = 0; i < n; ++i)
        if (std::fabs(mpi_recvbuf[i] - ring_recvbuf[i]) > EPS)
        {
            correct = false;
            break;
        }

    if (correct)
    {
        auto beg = ch::high_resolution_clock::now();
        for (int iter = 0; iter < ITER; ++iter)
            MPI_Allreduce(mpi_sendbuf, mpi_recvbuf, n, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
        auto end = ch::high_resolution_clock::now();
        double mpi_dur = ch::duration_cast<ch::duration<double>>(end - beg).count() * 1000; // ms

        beg = ch::high_resolution_clock::now();
        for (int iter = 0; iter < ITER; ++iter)
            Naive_Allreduce(naive_sendbuf, naive_recvbuf, n, MPI_COMM_WORLD, comm_sz, my_rank);
        end = ch::high_resolution_clock::now();
        double naive_dur = ch::duration_cast<ch::duration<double>>(end - beg).count() * 1000; // ms

        beg = ch::high_resolution_clock::now();
        for (int iter = 0; iter < ITER; ++iter)
            Ring_Allreduce(ring_sendbuf, ring_recvbuf, n, MPI_COMM_WORLD, comm_sz, my_rank);
        end = ch::high_resolution_clock::now();
        double ring_dur = ch::duration_cast<ch::duration<double>>(end - beg).count() * 1000; // ms

        if (my_rank == 0)
        {
            std::cout << "Correct." << std::endl;
            std::cout << "MPI_Allreduce: " << mpi_dur << " ms." << std::endl;
            std::cout << "Naive_Allreduce: " << naive_dur << " ms." << std::endl;
            std::cout << "Ring_Allreduce: " << ring_dur << " ms." << std::endl;
        }
    }
    else if (my_rank == 0)
        std::cout << "Wrong!" << std::endl;

    delete[] mpi_sendbuf;
    delete[] mpi_recvbuf;
    delete[] naive_sendbuf;
    delete[] naive_recvbuf;
    delete[] ring_sendbuf;
    delete[] ring_recvbuf;
    MPI_Finalize();
    return 0;
}
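
To build and run the benchmark (a sketch only; the file name here is an assumption, and the exact compiler wrapper and launcher names depend on your MPI installation, e.g. OpenMPI or MPICH):

mpicxx -O2 -o ring_allreduce ring_allreduce.cpp
mpirun -np 4 ./ring_allreduce 100 1000000

The first argument is ITER (the number of timed iterations) and the second is n (the vector length), matching the argv parsing in main; rank 0 prints the total time of each variant across all iterations.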
