Lecture Notes: 03-02 Data Races

Sum101

#include <stdio.h>
#include <assert.h>

// One billion: exclusive upper bound of the range scanned below.
const long TOP = 1000000000;

// Sequential baseline: test every integer in [0, TOP) and add the ones that
// are multiples of 101 to a running total, then print that total.
int
main(int _ac, char* _av[])
{
    long sum = 0;
    long ii = 0;

    printf("Summing numbers divisible by 101 from 0 to %ld.\n", TOP - 1);

    while (ii < TOP) {
        // Non-multiples contribute nothing to the total.
        sum += (ii % 101 == 0) ? ii : 0;
        ii++;
    }

    printf("Sum = %ld\n", sum);
    return 0;
}

Parallel version

#include <stdio.h>
#include <assert.h>

#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

// Parallel sum, first attempt: split [0, TOP) into NPROCS equal slices, fork
// one child per slice, and have every child add its multiples of 101 directly
// into a single long that lives in a shared anonymous mapping.
//
// NOTE: the shared total is updated with no synchronization. That is the data
// race these notes are about, so updates can be lost and the printed sum is
// not reliable. The race is kept on purpose; only the surrounding plumbing
// (flushing, error checks, index types) is hardened.
//
// Returns 0 on success, 1 if the mapping could not be created or a child
// could not be forked.
int
main(int _ac, char* _av[])
{
    // A billion.
    const long TOP = 1000000000;
    // Number of worker processes; also the number of slices.
    enum { NPROCS = 10 };
    const long NPP = TOP / NPROCS;

    printf("Summing numbers divisible by 101 from 0 to %ld.\n", TOP - 1);
    // Flush before forking: when stdout is fully buffered (pipe or file) each
    // child would otherwise inherit the pending text and print it again when
    // it calls exit().
    fflush(stdout);

    // An anonymous mapping starts out zero-filled, so *sum begins at 0.
    long* sum = mmap(0, sizeof(long), PROT_READ|PROT_WRITE,
                     MAP_SHARED|MAP_ANONYMOUS, -1, 0);
    if (sum == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    pid_t kids[NPROCS];
    int started = 0;
    int rv = 0;

    for (int pp = 0; pp < NPROCS; ++pp) {
        pid_t kid = fork();

        if (kid == -1) {
            // Stop spawning, but still reap the children already running.
            perror("fork");
            rv = 1;
            break;
        }

        if (kid == 0) {
            // Child: slice bounds stay long, like the sequential version, so
            // the arithmetic is never narrowed to int.
            long i0 = NPP*pp;
            long iN = i0 + NPP;

            for (long ii = i0; ii < iN; ++ii) {
                if (ii % 101 == 0) {
                    // Unsynchronized read-modify-write on shared memory.
                    *sum += ii;
                }
            }

            munmap(sum, sizeof(long));
            exit(0);
        }

        // Parent: remember the child so it can be waited for below.
        kids[started++] = kid;
    }

    for (int pp = 0; pp < started; ++pp) {
        waitpid(kids[pp], 0, 0);
    }

    // Only report a total when every slice was actually handed to a child.
    if (rv == 0) {
        printf("Sum = %ld\n", *sum);
    }

    munmap(sum, sizeof(long));
    return rv;
}

Add a lock:

#include <stdio.h>
#include <assert.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <semaphore.h>

// Map `size` bytes of anonymous, readable and writable memory with MAP_SHARED
// and return its address.
//
// The result is passed through from mmap unchanged, so a failure is reported
// as MAP_FAILED rather than NULL.
void*
malloc_shared(size_t size)
{
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_SHARED | MAP_ANONYMOUS;

    // No backing file: fd is -1 and the offset is 0.
    void* region = mmap(0, size, prot, flags, -1, 0);
    return region;
}

// Unmap the `size` bytes starting at `ptr`.
//
// Nothing is returned, so munmap's status is not reported to the caller.
// Presumably `size` should be the length the region was mapped with.
void
free_shared(void* ptr, size_t size)
{
    (void) munmap(ptr, size);
}

// Parallel sum with a lock: same slicing as the unlocked version, but every
// update of the shared total is wrapped in sem_wait / sem_post on a semaphore
// that lives in shared memory, so only one process updates *sum at a time.
//
// Returns 0 on success, 1 if shared memory, the semaphore, or a child process
// could not be set up.
int
main(int _ac, char* _av[])
{
    // A billion.
    const long TOP = 1000000000;
    // Number of worker processes; also the number of slices.
    enum { NPROCS = 10 };
    const long NPP = TOP / NPROCS;

    printf("Summing numbers divisible by 101 from 0 to %ld.\n", TOP - 1);
    // Flush before forking: when stdout is fully buffered (pipe or file) each
    // child would otherwise inherit the pending text and print it again when
    // it calls exit().
    fflush(stdout);

    // malloc_shared hands back mmap's result, so failure is MAP_FAILED.
    long* sum = malloc_shared(sizeof(long));
    if (sum == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    sem_t* lock = malloc_shared(sizeof(sem_t));
    if (lock == MAP_FAILED) {
        perror("mmap");
        free_shared(sum, sizeof(long));
        return 1;
    }

    // pshared = 1 makes the semaphore usable across processes; the initial
    // value 1 makes it behave as a mutex.
    if (sem_init(lock, 1, 1) != 0) {
        perror("sem_init");
        free_shared(sum, sizeof(long));
        free_shared(lock, sizeof(sem_t));
        return 1;
    }
    // Semaphores?

    pid_t kids[NPROCS];
    int started = 0;
    int rv = 0;

    for (int pp = 0; pp < NPROCS; ++pp) {
        pid_t kid = fork();

        if (kid == -1) {
            // Stop spawning, but still reap the children already running.
            perror("fork");
            rv = 1;
            break;
        }

        if (kid == 0) {
            // Child: slice bounds stay long, like the sequential version, so
            // the arithmetic is never narrowed to int.
            long i0 = NPP*pp;
            long iN = i0 + NPP;

            for (long ii = i0; ii < iN; ++ii) {
                // try wait here
                if (ii % 101 == 0) {
                    sem_wait(lock);
                    *sum += ii;
                    sem_post(lock);
                }
                // and post here
            }

            // Children only drop their mappings; the semaphore itself is
            // destroyed once, by the parent, after everyone has finished.
            free_shared(sum, sizeof(long));
            free_shared(lock, sizeof(sem_t));
            exit(0);
        }

        // Parent: remember the child so it can be waited for below.
        kids[started++] = kid;
    }

    for (int pp = 0; pp < started; ++pp) {
        waitpid(kids[pp], 0, 0);
    }

    // Only report a total when every slice was actually handed to a child.
    if (rv == 0) {
        printf("Sum = %ld\n", *sum);
    }

    sem_destroy(lock);
    free_shared(sum, sizeof(long));
    free_shared(lock, sizeof(sem_t));
    return rv;
}

Real work

#include <stdio.h>
#include <assert.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <semaphore.h>

// Map `size` bytes of anonymous, readable and writable memory with MAP_SHARED
// and return its address.
//
// The result is passed through from mmap unchanged, so a failure is reported
// as MAP_FAILED rather than NULL.
void*
malloc_shared(size_t size)
{
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_SHARED | MAP_ANONYMOUS;

    // No backing file: fd is -1 and the offset is 0.
    void* region = mmap(0, size, prot, flags, -1, 0);
    return region;
}

// Unmap the `size` bytes starting at `ptr`.
//
// Nothing is returned, so munmap's status is not reported to the caller.
// Presumably `size` should be the length the region was mapped with.
void
free_shared(void* ptr, size_t size)
{
    (void) munmap(ptr, size);
}

// Two processes, two process-shared semaphores used as locks.
//
// The parent takes aa and then bb; the child takes bb and then aa. Because of
// the sleep(1) between the two acquisitions, each side will normally already
// hold its first lock when it asks for the second, so both block forever.
// NOTE(review): this looks like the deadlock the notes want to demonstrate,
// so the ordering is left as it was. Taking the locks in the same order in
// both processes would remove it.
//
// Returns 0 on the normal path, 1 if the shared memory or the child process
// could not be created.
int
main(int _ac, char* _av[])
{
    // One size used for both allocation and release, so the two always agree.
    const size_t locks_size = 2 * sizeof(sem_t);

    // malloc_shared hands back mmap's result, so failure is MAP_FAILED.
    sem_t* locks = malloc_shared(locks_size);
    if (locks == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    sem_t* aa = &(locks[0]);
    sem_t* bb = &(locks[1]);

    // pshared = 1, initial value 1: each semaphore acts as a cross-process mutex.
    sem_init(aa, 1, 1);
    sem_init(bb, 1, 1);

    pid_t cpid = fork();
    if (cpid == -1) {
        perror("fork");
        free_shared(locks, locks_size);
        return 1;
    }

    if (cpid) {
        printf("In parent\n");

        sem_wait(aa);
        sleep(1);
        sem_wait(bb);

        printf("Doing real work in parent...\n");

        sem_post(bb);
        sem_post(aa);

        waitpid(cpid, 0, 0);
    }
    else {
        printf("In child\n");

        // Opposite acquisition order from the parent.
        sem_wait(bb);
        sleep(1);
        sem_wait(aa);

        printf("Doing real work in child...\n");

        sem_post(bb);
        sem_post(aa);
    }

    free_shared(locks, locks_size);
    return 0;
}

Virtual Memory and Fork: A Review
#

Draw the virtual memory diagram.
Allocate some shared memory.
Fork.
Point out that shared memory is shared, and non-shared writable memory soon isn’t.

Introducing Threads
#

// create.c
#include <stdio.h>
#include <pthread.h>
#include <assert.h>

#define NN 10

// Thread entry point. `thread_arg` must point to an int: the function prints
// that value, adds it to itself in place (doubling it), and returns the same
// pointer so the joiner can read the result.
void*
thread_main(void* thread_arg)
{
    int* slot = (int*) thread_arg;
    int xx = *slot;

    printf("thread %d: We're in a thread.\n", xx);

    // Write back into the caller's storage.
    *slot = *slot + xx;
    return slot;
}

// Start NN threads running thread_main, giving each one a pointer to its own
// element of nums[], then join them in creation order and print the int each
// thread's return pointer refers to.
int
main(int _argc, char* _argv[])
{
    int nums[NN];
    int rv;
    pthread_t threads[NN];

    printf("main: Starting %d threads.\n", NN);

    for (int ii = 0; ii < NN; ++ii) {
        // Each thread gets a distinct slot, so no two threads write the same int.
        nums[ii] = ii;

        rv = pthread_create(&(threads[ii]), 0, thread_main, &(nums[ii]));
        assert(rv == 0);
    }

    printf("main: Started %d threads.\n", NN);

    for (int ii = 0; ii < NN; ++ii) {
        void* ret;
        rv = pthread_join(threads[ii], &ret);
        // Check the join the same way as the create: if it failed, `ret`
        // would be read below without ever having been set.
        assert(rv == 0);

        // The "rv" in the message is the thread's result, not the join status.
        int yy = *((int*) ret);
        printf("main: Joined thread %d, rv = %d.\n", ii, yy);
    }

    printf("main: All threads joined.\n");

    return 0;
}

show create.c
Discuss how threads change the virtual memory story.

Threads vs. Processes
#

We can spawn multiple processes with fork()
We can execute multiple threads within a single process.

Key difference: With threads, all memory is shared by default.

Advantage: Allocating shared memory post-spawn.
Disadvantage: 100% data races — every variable is shared, so any unsynchronized access from two threads can race.

History
#

Early days
#

Before multi-processor systems, parallelism didn’t matter.
Concurrency was still useful though:
- Running multiple programs at once.
- Having multiple logical tasks happening within one program.
On Unix style systems, processes were commonly used for concurrency.
On early Windows / Mac systems, concurrency within a program was represented by cooperative threading:
- One thread could run at a time.
- To let other threads run, explicitly call yield()
- Some systems had an implicit yield when a thread blocked on I/O.
By the 90’s, systems had some sort of pre-emptive threading. This still didn’t work in parallel, but it would automatically schedule work between threads without explicit yield() calls.

Multiprocessors
#

Multiprocessor servers became widely available in the mid 90’s.
Windows and Solaris had decent parallel thread support.
Linux didn’t get fully functional threads until around 2002, so fork() was heavily optimized instead.
Result: Threads are much more efficient than processes on Windows.
Threads under Linux evolved from fork(), so the performance difference is small.
Multi-core desktop processors showed up around 2005, and suddenly parallelism became necessary for performance.

Author

Nat Tuck

Virtual Memory and Fork: A Review #

Introducing Threads #

Threads vs. Processes #

History #

Early days #

Multiprocessors #

Virtual Memory and Fork: A Review
#

Introducing Threads
#

Threads vs. Processes
#

History
#

Early days
#

Multiprocessors
#