diff --git a/L21/key/mpi_stdev_v1.c b/L21/key/mpi_stdev_v1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c4302d3d398abdb0d7f6fa74a6f510045ef01432
--- /dev/null
+++ b/L21/key/mpi_stdev_v1.c
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <mpi.h>
+
+// Computes the standard deviation of the integers 1..N with MPI, using
+// point-to-point messages through rank 0 for the reduction and the broadcast.
+// Command line: a single positive integer N.
+// Returns 0 on success and 1 on a usage or environment error.
+int main (int argc, char* argv[]) {
+
+    MPI_Init (&argc, &argv);
+
+    // MPI_COMM_WORLD is the default communicator that contains all ranks
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    char node_name[MPI_MAX_PROCESSOR_NAME];
+    int node_name_len;
+    MPI_Get_processor_name(node_name,&node_name_len);
+
+    // make sure we are not running on a login node!
+    // MPI_Abort terminates the whole job, so ranks that pass this check
+    // are not left blocked waiting on a rank that has already exited
+    if ((strcmp(node_name,"tinkercliffs1") == 0) ||
+            (strcmp(node_name,"tinkercliffs2") == 0)) {
+        printf ("error : running on login node %s!\n",node_name);
+        MPI_Abort (MPI_COMM_WORLD,1);
+        return 1;
+    }
+
+    // get N from the command line
+    // (the program relies on every rank seeing the same arguments, so all
+    // ranks take the same branch here and can finalize MPI before exiting)
+    if (argc < 2) {
+        if (rank == 0) {
+            printf ("Command usage : %s %s\n",argv[0],"N");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+    long long N = atoll(argv[1]);
+    if (N < 1) {
+        // the mean and variance below divide by N, so N must be at least 1
+        if (rank == 0) {
+            printf ("error : N must be a positive integer\n");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+
+    // start the timer
+    double start_time, end_time;
+    start_time = MPI_Wtime();
+
+    // each rank computes a partial sum over i = 1+rank, 1+rank+size, ...
+    long long sum = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum += i;
+    }
+
+    // rank 0 sums up partial sums from each rank
+    if (rank == 0) {
+        long long rank_sum;
+        MPI_Status status;
+        for (int source=1;source<size;source++) {
+            MPI_Recv (&rank_sum,1,MPI_LONG_LONG,source,0,MPI_COMM_WORLD,&status);
+            sum += rank_sum;
+        }
+    } else {
+        int dest = 0;
+        MPI_Send (&sum,1,MPI_LONG_LONG,dest,0,MPI_COMM_WORLD);
+    }
+
+    // rank 0 broadcasts the total sum to all other ranks
+    if (rank == 0) {
+        for (int dest=1;dest<size;dest++) {
+            MPI_Send (&sum,1,MPI_LONG_LONG,dest,0,MPI_COMM_WORLD);
+        }
+    } else {
+        int source = 0;
+        MPI_Status status;
+        MPI_Recv (&sum,1,MPI_LONG_LONG,source,0,MPI_COMM_WORLD,&status);
+    }
+
+    // every rank has the correct sum and can now compute the mean
+    double mean = 1.0*sum/N;
+
+    // each rank computes a partial sum of difference squares
+    double sum_diff_sq = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum_diff_sq += (i-mean)*(i-mean);
+    }
+
+    // rank 0 sums up partial sums of difference squares from each rank
+    if (rank == 0) {
+        double rank_sum_diff_sq;
+        MPI_Status status;
+        for (int source=1;source<size;source++) {
+            MPI_Recv (&rank_sum_diff_sq,1,MPI_DOUBLE,source,0,MPI_COMM_WORLD,&status);
+            sum_diff_sq += rank_sum_diff_sq;
+        }
+    } else {
+        int dest = 0;
+        MPI_Send (&sum_diff_sq,1,MPI_DOUBLE,dest,0,MPI_COMM_WORLD);
+    }
+
+    // only rank 0 has the correct sum of diff sqs
+    // and can now compute the correct variance
+    // (other ranks will not compute the correct variance)
+    double variance = sum_diff_sq/N;
+
+    // calculate the standard deviation
+    double stdev = sqrt(variance);
+
+    // stop the timer
+    end_time = MPI_Wtime();
+
+    // only rank 0 has the correct standard deviation
+    if (rank == 0) {
+        // evaluate N^2 in double: N*N in long long overflows for N above ~3e9
+        double n = (double)N;
+        printf ("num ranks = %d, elapsed time = %g\n",size,end_time-start_time);
+        printf ("standard deviation is %.3lf, sqrt((N^2-1)/12) is %.3lf\n",
+                stdev,sqrt((n*n-1)/12.0));
+    }
+
+    MPI_Finalize();
+    return 0;
+}
+
diff --git a/L21/key/mpi_stdev_v2.c b/L21/key/mpi_stdev_v2.c
new file mode 100644
index 0000000000000000000000000000000000000000..2385a3c91585e58c1674521f5013dde928f718b0
--- /dev/null
+++ b/L21/key/mpi_stdev_v2.c
@@ -0,0 +1,161 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <mpi.h>
+
+// Computes the standard deviation of the integers 1..N with MPI, using
+// tree-shaped point-to-point message passing for the reductions and the
+// broadcast.  The number of ranks must be a power of 2.
+// Command line: a single positive integer N.
+// Returns 0 on success and 1 on a usage or environment error.
+int main (int argc, char* argv[]) {
+
+    MPI_Init (&argc, &argv);
+
+    // MPI_COMM_WORLD is the default communicator that contains all ranks
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    char node_name[MPI_MAX_PROCESSOR_NAME];
+    int node_name_len;
+    MPI_Get_processor_name(node_name,&node_name_len);
+
+    // make sure we are not running on a login node!
+    // MPI_Abort terminates the whole job, so ranks that pass this check
+    // are not left blocked waiting on a rank that has already exited
+    if ((strcmp(node_name,"tinkercliffs1") == 0) ||
+            (strcmp(node_name,"tinkercliffs2") == 0)) {
+        printf ("error : running on login node %s!\n",node_name);
+        MPI_Abort (MPI_COMM_WORLD,1);
+        return 1;
+    }
+
+    // get N from the command line
+    // (the program relies on every rank seeing the same arguments, so all
+    // ranks take the same branch here and can finalize MPI before exiting)
+    if (argc < 2) {
+        if (rank == 0) {
+            printf ("Command usage : %s %s\n",argv[0],"N");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+    long long N = atoll(argv[1]);
+    if (N < 1) {
+        // the mean and variance below divide by N, so N must be at least 1
+        if (rank == 0) {
+            printf ("error : N must be a positive integer\n");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+
+    // the halving/doubling pairing used below only matches every sender with
+    // a receiver when size = 2^k, so enforce that assumption up front
+    if ((size & (size-1)) != 0) {
+        if (rank == 0) {
+            printf ("error : number of ranks (%d) must be a power of 2\n",size);
+        }
+        MPI_Finalize();
+        return 1;
+    }
+
+    // start the timer
+    double start_time, end_time;
+    start_time = MPI_Wtime();
+
+    // each rank computes a partial sum
+    long long sum = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum += i;
+    }
+
+    // use parallel message passing to reduce the partial sums with result on rank 0
+    // each pass, the upper half of the alive ranks sends to the lower half
+    int alive = size;
+    while (alive > 1) {
+        if (rank < alive/2) {
+            // rank is a receiver
+            long long rank_sum;
+            MPI_Status status;
+            int src = rank + alive/2;
+            MPI_Recv (&rank_sum, 1, MPI_LONG_LONG, src, 0, MPI_COMM_WORLD, &status);
+            sum += rank_sum;
+        } else if (rank < alive) {
+            // rank is a sender
+            int dest = rank - alive/2;
+            MPI_Send (&sum, 1, MPI_LONG_LONG, dest, 0, MPI_COMM_WORLD);
+        }
+        alive = alive/2;
+    }
+
+    // use parallel message passing to broadcast the sum on rank 0 to all other ranks
+    // each pass, the ranks that already hold the sum forward it to the next block
+    alive = 1;
+    while (alive < size) {
+        alive = alive*2;
+        if (rank < alive/2) {
+            // rank is a sender
+            int dest = rank + alive/2;
+            MPI_Send (&sum, 1, MPI_LONG_LONG, dest, 0, MPI_COMM_WORLD);
+        } else if (rank < alive) {
+            // rank is a receiver
+            MPI_Status status;
+            int src = rank - alive/2;
+            MPI_Recv (&sum, 1, MPI_LONG_LONG, src, 0, MPI_COMM_WORLD, &status);
+        }
+    }
+
+    // every rank has the correct sum and can now compute the mean
+    double mean = 1.0*sum/N;
+
+    // each rank computes a partial sum of difference squares
+    double sum_diff_sq = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum_diff_sq += (i-mean)*(i-mean);
+    }
+
+    // use parallel message passing to reduce the partial sums of difference squares
+    // with result on rank 0 (same halving scheme as the first reduction)
+    alive = size;
+    while (alive > 1) {
+        if (rank < alive/2) {
+            // rank is a receiver
+            double rank_sum_diff_sq;
+            MPI_Status status;
+            int src = rank + alive/2;
+            MPI_Recv (&rank_sum_diff_sq, 1, MPI_DOUBLE, src, 0, MPI_COMM_WORLD, &status);
+            sum_diff_sq += rank_sum_diff_sq;
+        } else if (rank < alive) {
+            // rank is a sender
+            int dest = rank - alive/2;
+            MPI_Send (&sum_diff_sq, 1, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD);
+        }
+        alive = alive/2;
+    }
+
+    // only rank 0 has the correct sum of diff sqs
+    // and can now compute the correct variance
+    // (other ranks will not compute the correct variance)
+    double variance = sum_diff_sq/N;
+
+    // calculate the standard deviation
+    double stdev = sqrt(variance);
+
+    // stop the timer
+    end_time = MPI_Wtime();
+
+    // only rank 0 has the correct standard deviation
+    if (rank == 0) {
+        // evaluate N^2 in double: N*N in long long overflows for N above ~3e9
+        double n = (double)N;
+        printf ("num ranks = %d, elapsed time = %g\n",size,end_time-start_time);
+        printf ("standard deviation is %.3lf, sqrt((N^2-1)/12) is %.3lf\n",
+                stdev,sqrt((n*n-1)/12.0));
+    }
+
+    MPI_Finalize();
+    return 0;
+}
+
diff --git a/L21/key/mpi_stdev_v3.c b/L21/key/mpi_stdev_v3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a6fc31c2f37428cc5640e331827306272e30f9e0
--- /dev/null
+++ b/L21/key/mpi_stdev_v3.c
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <mpi.h>
+
+// Computes the standard deviation of the integers 1..N with MPI, using the
+// MPI_Reduce and MPI_Bcast collectives.
+// Command line: a single positive integer N.
+// Returns 0 on success and 1 on a usage or environment error.
+int main (int argc, char* argv[]) {
+
+    MPI_Init (&argc, &argv);
+
+    // MPI_COMM_WORLD is the default communicator that contains all ranks
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    char node_name[MPI_MAX_PROCESSOR_NAME];
+    int node_name_len;
+    MPI_Get_processor_name(node_name,&node_name_len);
+
+    // make sure we are not running on a login node!
+    // MPI_Abort terminates the whole job, so ranks that pass this check
+    // are not left blocked waiting on a rank that has already exited
+    if ((strcmp(node_name,"tinkercliffs1") == 0) ||
+            (strcmp(node_name,"tinkercliffs2") == 0)) {
+        printf ("error : running on login node %s!\n",node_name);
+        MPI_Abort (MPI_COMM_WORLD,1);
+        return 1;
+    }
+
+    // get N from the command line
+    // (the program relies on every rank seeing the same arguments, so all
+    // ranks take the same branch here and can finalize MPI before exiting)
+    if (argc < 2) {
+        if (rank == 0) {
+            printf ("Command usage : %s %s\n",argv[0],"N");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+    long long N = atoll(argv[1]);
+    if (N < 1) {
+        // the mean and variance below divide by N, so N must be at least 1
+        if (rank == 0) {
+            printf ("error : N must be a positive integer\n");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+
+    // start the timer
+    double start_time, end_time;
+    start_time = MPI_Wtime();
+
+    // each rank computes a partial sum over i = 1+rank, 1+rank+size, ...
+    long long sum = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum += i;
+    }
+
+    // rank 0 sums up partial sums from each rank
+    // (the send and receive buffers must be distinct, hence the copy in rank_sum)
+    long long rank_sum = sum;
+    MPI_Reduce(&rank_sum,&sum,1,MPI_LONG_LONG,MPI_SUM,0,MPI_COMM_WORLD);
+
+    // rank 0 broadcasts the total sum to all other ranks
+    MPI_Bcast(&sum,1,MPI_LONG_LONG,0,MPI_COMM_WORLD);
+
+    // every rank has the correct sum and can now compute the mean
+    double mean = 1.0*sum/N;
+
+    // each rank computes a partial sum of difference squares
+    double sum_diff_sq = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum_diff_sq += (i-mean)*(i-mean);
+    }
+
+    // rank 0 sums up partial sums of difference squares from each rank
+    double rank_sum_diff_sq = sum_diff_sq;
+    MPI_Reduce(&rank_sum_diff_sq,&sum_diff_sq,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
+
+    // only rank 0 has the correct sum of diff sqs
+    // and can now compute the correct variance
+    // (other ranks will not compute the correct variance)
+    double variance = sum_diff_sq/N;
+
+    // calculate the standard deviation
+    double stdev = sqrt(variance);
+
+    // stop the timer
+    end_time = MPI_Wtime();
+
+    // only rank 0 has the correct standard deviation
+    if (rank == 0) {
+        // evaluate N^2 in double: N*N in long long overflows for N above ~3e9
+        double n = (double)N;
+        printf ("num ranks = %d, elapsed time = %g\n",size,end_time-start_time);
+        printf ("standard deviation is %.3lf, sqrt((N^2-1)/12) is %.3lf\n",
+                stdev,sqrt((n*n-1)/12.0));
+    }
+
+    MPI_Finalize();
+    return 0;
+}
+
diff --git a/L21/key/mpi_stdev_v4.c b/L21/key/mpi_stdev_v4.c
new file mode 100644
index 0000000000000000000000000000000000000000..63de6d39bd0c5df64a313332e4f759cb57e99f7d
--- /dev/null
+++ b/L21/key/mpi_stdev_v4.c
@@ -0,0 +1,105 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <mpi.h>
+
+// Computes the standard deviation of the integers 1..N with MPI, using
+// MPI_Allreduce for the sum and MPI_Reduce for the sum of squared differences.
+// Command line: a single positive integer N.
+// Returns 0 on success and 1 on a usage or environment error.
+int main (int argc, char* argv[]) {
+
+    MPI_Init (&argc, &argv);
+
+    // MPI_COMM_WORLD is the default communicator that contains all ranks
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    char node_name[MPI_MAX_PROCESSOR_NAME];
+    int node_name_len;
+    MPI_Get_processor_name(node_name,&node_name_len);
+
+    // make sure we are not running on a login node!
+    // MPI_Abort terminates the whole job, so ranks that pass this check
+    // are not left blocked waiting on a rank that has already exited
+    if ((strcmp(node_name,"tinkercliffs1") == 0) ||
+            (strcmp(node_name,"tinkercliffs2") == 0)) {
+        printf ("error : running on login node %s!\n",node_name);
+        MPI_Abort (MPI_COMM_WORLD,1);
+        return 1;
+    }
+
+    // get N from the command line
+    // (the program relies on every rank seeing the same arguments, so all
+    // ranks take the same branch here and can finalize MPI before exiting)
+    if (argc < 2) {
+        if (rank == 0) {
+            printf ("Command usage : %s %s\n",argv[0],"N");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+    long long N = atoll(argv[1]);
+    if (N < 1) {
+        // the mean and variance below divide by N, so N must be at least 1
+        if (rank == 0) {
+            printf ("error : N must be a positive integer\n");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+
+    // start the timer
+    double start_time, end_time;
+    start_time = MPI_Wtime();
+
+    // each rank computes a partial sum over i = 1+rank, 1+rank+size, ...
+    long long sum = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum += i;
+    }
+
+    // sum the partial sums over all ranks and leave the total on every rank
+    // (one collective replaces the reduce-then-broadcast pair)
+    // MPI_IN_PLACE may be passed as the send buffer on every rank for
+    // MPI_Allreduce; for MPI_Reduce it is only permitted at the root rank
+    MPI_Allreduce(MPI_IN_PLACE,&sum,1,MPI_LONG_LONG,MPI_SUM,MPI_COMM_WORLD);
+
+    // every rank has the correct sum and can now compute the mean
+    double mean = 1.0*sum/N;
+
+    // each rank computes a partial sum of difference squares
+    double sum_diff_sq = 0;
+    for (long long i=1+rank;i<=N;i+=size) {
+        sum_diff_sq += (i-mean)*(i-mean);
+    }
+
+    // rank 0 sums up partial sums of difference squares from each rank
+    double rank_sum_diff_sq = sum_diff_sq;
+    MPI_Reduce(&rank_sum_diff_sq,&sum_diff_sq,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
+
+    // only rank 0 has the correct sum of diff sqs
+    // and can now compute the correct variance
+    // (other ranks will not compute the correct variance)
+    double variance = sum_diff_sq/N;
+
+    // calculate the standard deviation
+    double stdev = sqrt(variance);
+
+    // stop the timer
+    end_time = MPI_Wtime();
+
+    // only rank 0 has the correct standard deviation
+    if (rank == 0) {
+        // evaluate N^2 in double: N*N in long long overflows for N above ~3e9
+        double n = (double)N;
+        printf ("num ranks = %d, elapsed time = %g\n",size,end_time-start_time);
+        printf ("standard deviation is %.3lf, sqrt((N^2-1)/12) is %.3lf\n",
+                stdev,sqrt((n*n-1)/12.0));
+    }
+
+    MPI_Finalize();
+    return 0;
+}
+
diff --git a/L21/key/mpi_sum_v1.c b/L21/key/mpi_sum_v1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c524441a604efa616f46e4dc65ec01857fd36455
--- /dev/null
+++ b/L21/key/mpi_sum_v1.c
@@ -0,0 +1,81 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mpi.h>
+
+// Computes the sum of the integers 1..N with MPI, collecting the partial sums
+// on rank 0 with point-to-point messages.
+// Command line: a single integer N.
+// Returns 0 on success and 1 on a usage or environment error.
+int main(int argc, char *argv[]) {
+
+    MPI_Init (&argc, &argv);
+
+    // MPI_COMM_WORLD is the default communicator that contains all ranks
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    char node_name[MPI_MAX_PROCESSOR_NAME];
+    int node_name_len;
+    MPI_Get_processor_name(node_name,&node_name_len);
+
+    // make sure we are not running on a login node!
+    // MPI_Abort terminates the whole job, so ranks that pass this check
+    // are not left blocked waiting on a rank that has already exited
+    if ((strcmp(node_name,"tinkercliffs1") == 0) ||
+            (strcmp(node_name,"tinkercliffs2") == 0)) {
+        printf ("error : running on login node %s!\n",node_name);
+        MPI_Abort (MPI_COMM_WORLD,1);
+        return 1;
+    }
+
+    // get N from command line
+    // (the program relies on every rank seeing the same arguments, so all
+    // ranks take the same branch here and can finalize MPI before exiting)
+    if (argc < 2) {
+        if (rank == 0) {
+            printf ("Command usage : %s %s\n",argv[0],"N");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+    long long N = atoll(argv[1]);
+
+    // start the timer
+    double start_time, end_time;
+    start_time = MPI_Wtime();
+
+    // calculate the sum (each rank takes i = 1+rank, 1+rank+size, ...)
+    long long sum = 0;
+    for (long long i = 1+rank; i <= N;i+=size) {
+        sum += i;
+    }
+
+    // all nonzero ranks send their partial sums to rank 0
+    if (rank == 0) {
+        long long rank_sum;
+        MPI_Status status;
+        for (int src = 1;src < size;src++) {
+            MPI_Recv(&rank_sum,1,MPI_LONG_LONG,src,0,MPI_COMM_WORLD,&status);
+            sum += rank_sum;
+        }
+    } else {
+        int dest = 0;
+        MPI_Send(&sum,1,MPI_LONG_LONG,dest,0,MPI_COMM_WORLD);
+    }
+
+    // stop the timer
+    end_time = MPI_Wtime();
+
+    // print results
+    if (rank == 0) {
+        // halve whichever of N and N+1 is even so the closed form is exact;
+        // (N/2)*(N+1) truncates and gives the wrong value when N is odd
+        long long expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+        printf ("elapsed time = %.4f seconds\n",end_time-start_time);
+        printf ("sum = %lld, N*(N+1)/2 = %lld\n",sum,expected);
+    }
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/L21/key/mpi_sum_v2.c b/L21/key/mpi_sum_v2.c
new file mode 100644
index 0000000000000000000000000000000000000000..4bc791cb352f35fff2b942b840d8aa02e3abaea5
--- /dev/null
+++ b/L21/key/mpi_sum_v2.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mpi.h>
+
+// Computes the sum of the integers 1..N with MPI, combining the partial sums
+// on rank 0 with a tree-shaped reduction.  The number of ranks must be a
+// power of 2.
+// Command line: a single integer N.
+// Returns 0 on success and 1 on a usage or environment error.
+int main(int argc, char *argv[]) {
+
+    MPI_Init (&argc, &argv);
+
+    // MPI_COMM_WORLD is the default communicator that contains all ranks
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    char node_name[MPI_MAX_PROCESSOR_NAME];
+    int node_name_len;
+    MPI_Get_processor_name(node_name,&node_name_len);
+
+    // make sure we are not running on a login node!
+    // MPI_Abort terminates the whole job, so ranks that pass this check
+    // are not left blocked waiting on a rank that has already exited
+    if ((strcmp(node_name,"tinkercliffs1") == 0) ||
+            (strcmp(node_name,"tinkercliffs2") == 0)) {
+        printf ("error : running on login node %s!\n",node_name);
+        MPI_Abort (MPI_COMM_WORLD,1);
+        return 1;
+    }
+
+    // get N from command line
+    // (the program relies on every rank seeing the same arguments, so all
+    // ranks take the same branch here and can finalize MPI before exiting)
+    if (argc < 2) {
+        if (rank == 0) {
+            printf ("Command usage : %s %s\n",argv[0],"N");
+        }
+        MPI_Finalize();
+        return 1;
+    }
+    long long N = atoll(argv[1]);
+
+    // the halving pairing used below only matches every sender with a
+    // receiver when size = 2^k, so enforce that assumption up front
+    if ((size & (size-1)) != 0) {
+        if (rank == 0) {
+            printf ("error : number of ranks (%d) must be a power of 2\n",size);
+        }
+        MPI_Finalize();
+        return 1;
+    }
+
+    // start the timer
+    double start_time, end_time;
+    start_time = MPI_Wtime();
+
+    // calculate the sum (each rank takes i = 1+rank, 1+rank+size, ...)
+    long long sum = 0;
+    for (long long i = 1+rank; i <= N;i+=size) {
+        sum += i;
+    }
+
+    // use parallel message passing to reduce the partial sums with result on rank 0
+    // each pass, the upper half of the alive ranks sends to the lower half
+    int alive = size;
+    while (alive > 1) {
+        if (rank < alive/2) {
+            // rank is a receiver
+            long long rank_sum;
+            MPI_Status status;
+            int src = rank + alive/2;
+            MPI_Recv (&rank_sum, 1, MPI_LONG_LONG, src, 0, MPI_COMM_WORLD, &status);
+            sum += rank_sum;
+        } else if (rank < alive) {
+            // rank is a sender
+            int dest = rank - alive/2;
+            MPI_Send (&sum, 1, MPI_LONG_LONG, dest, 0, MPI_COMM_WORLD);
+        }
+        alive = alive/2;
+    }
+
+    // stop the timer
+    end_time = MPI_Wtime();
+
+    // print results
+    if (rank == 0) {
+        // halve whichever of N and N+1 is even so the closed form is exact;
+        // (N/2)*(N+1) truncates and gives the wrong value when N is odd
+        long long expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+        printf ("elapsed time = %.4f seconds\n",end_time-start_time);
+        printf ("sum = %lld, N*(N+1)/2 = %lld\n",sum,expected);
+    }
+
+    MPI_Finalize();
+    return 0;
+}