/* MPI parts for migrate
   started November 2000, Seattle
   Peter Beerli beerli@csit.fsu.edu
 
   
Copyright 2001 Peter Beerli and Joseph Felsenstein
 
 This software is distributed free of charge for non-commercial use
 and is copyrighted. Of course, we do not guarantee that the software
 works and are not responsible for any damage you may cause or have.
 
 
$Id: migrate_mpi.c,v 1.32 2003/11/04 16:12:09 beerli Exp $
*/
#ifdef MPI
#include "definitions.h"
#include "sighandler.h"
#include "migrate_mpi.h"
#include "broyden.h"
#include "combroyden.h"
#include "gammalike.h"
#include "profile.h"
#include "options.h"
#include "world.h"
#include "joint-chains.h"
#include "data.h"
#include "laguerre.h"
/*should go into profile.h*/
#define GRIDSIZE 9


extern void run_locus (world_fmt ** universe, int usize,
                           option_fmt * options, data_fmt * data,
                           tpool_t * heating_pool, long maxreplicate,
                           long locus, long *treefilepos, long *Gmax);

long pack_databuffer (char **buffer, data_fmt * data, option_fmt * options);
void unpack_databuffer (char *buffer, data_fmt * data, option_fmt * options);
void pack_allele_data (char **buffer, long *bufsize, data_fmt * data,
                       long pop, long ind);
void pack_sequence_data (char **buffer, long *bufsize, data_fmt * data,
                         long pop, long ind, long locus);
void mpi_gradient_master (nr_fmt * nr, world_fmt * world, int *who);
void mpi_results_master (double sendtype, world_fmt * world,
                         long maxrep,
                         void (*unpack) (char *buffer, world_fmt * world,
                                         long locus, long maxrep,
                                         long numpop));

void mpi_results_worker (long bufs, world_fmt * world,
                         long maxrep,
                         long (*pack) (char **buffer, world_fmt * world,
                                       long locus, long maxrep, long numpop));
void assignloci_worker (world_fmt * world);
void swap_atl (long from, long to, world_fmt * world);
long pack_quantile (char **buffer, quantile_fmt quant, long n);
void unpack_quantile (char *buffer, quantile_fmt quant, long n);

void handle_message(char *rawmessage,int sender, world_fmt *world);


void set_filehandle(char *message, world_fmt *world,
                    FILE **file, long *msgstart);

void
mpi_runloci_master (long loci, int *who, world_fmt *world)
{
    long locus;
    int sender = 0;
    long locusdone = -1;
    MPI_Status status;
    long numsent = 0;
    boolean done = FALSE;
    char *tempstr;
    long tempstrsize=MAXBUFSIZE;

    tempstr = (char *) calloc(MAXBUFSIZE,sizeof(char));

    for (locus = 0; locus < MIN (loci, numcpu - 1); locus++)
    {
        MYMPISEND (&locus, 1, MPI_LONG, locus + 1, locus + 1, comm_world);
        numsent++;
    }
    for (locus = 0; locus < loci; locus++)
    {
        done=FALSE;
        while(!done)
        {
            MYMPIRECV (tempstr, SMALLBUFSIZE, MPI_CHAR, MPI_ANY_SOURCE,MPI_ANY_TAG, comm_world, &status);
            sender = status.MPI_SOURCE;
//            fprintf(stdout,"%i> @@@@%s@@@@\n",sender, rawmessage);
            switch(tempstr[0])
            {
            case 'M':
                tempstrsize = atol(tempstr+1);
                MYMPIRECV (tempstr, tempstrsize, MPI_CHAR, sender, MPI_ANY_TAG,
                           comm_world, &status);
                handle_message(tempstr,sender, world);
                break;
            case 'R':
                //ignore first character and translate into locusnumber
                locusdone = atol(tempstr+1);
                done=TRUE;
                break;
            default:
                error("DIED because of wrong message from worker");
                break;
            }
        }
        who[locusdone] = sender;
        if (numsent < loci)
        {
            MYMPISEND (&numsent, 1, MPI_LONG, sender, numsent + 1, comm_world);
            numsent++;
        }
        else
        {
            MYMPISEND (&loci, 1, MPI_LONG, sender, 0, comm_world); //stop workers to wait for new loci
        }
    }
    //stop worker that had never the chance to work on a locus, but are still
    //listening
    for (sender = MIN (loci, numcpu - 1) + 1; sender < numcpu; sender++)
    {
        MYMPISEND (&loci, 1, MPI_LONG, sender, 0, comm_world); //stop worker to wait for new loci
    }
}


void
mpi_runloci_worker (world_fmt ** universe, int usize,
                    option_fmt * options, data_fmt * data,
                    tpool_t * heating_pool, long maxreplicate,
                    long *treefilepos, long *Gmax)
{
    boolean done = FALSE;
    long locus;
    char *rawmessage;
    long rawmsgsize = 0;
    MPI_Status status;
    rawmessage = (char *) calloc(STRSIZE,sizeof(char));
    while (!done)
    {
        MYMPIRECV (&locus, 1, MPI_LONG, MASTER, MPI_ANY_TAG,
                  comm_world, &status);
        if (status.MPI_TAG != 0) //stop condition
        {
            run_locus (universe, usize, options, data,
                       heating_pool, maxreplicate, locus, treefilepos, Gmax);
            rawmsgsize = sprintf(rawmessage,"R%li",locus) + 1;
            MYMPISEND (rawmessage, rawmsgsize, MPI_CHAR, MASTER, locus + 1, comm_world);
            /* we want to know what locus we worked for
               - to control the work sent by master
               - to use in setup_parameter0() [combroyden2.c] */
            universe[0]->who[locidone++] = locus;
        }
        else
        {
            done = TRUE;
        }
    }
}

#if 0
//---------replication in MPI
void
mpi_runreplicates_master (long loci, int *who, world_fmt *world)
{
    long locrep;
    int sender = 0;
    long locusdone = -1;
    MPI_Status status;
    long numsent = 0;
    boolean done = FALSE;
    char *tempstr;
    long tempstrsize=MAXBUFSIZE;
    long temp[2];
    long  maxreplicate = (world->options->replicate
                          && world->options->replicatenum >
                          0) ? world->options->replicatenum : 1;
    long locirep = loci * maxreplicate;
    
    tempstr = (char *) calloc(MAXBUFSIZE,sizeof(char));
    // loop over loci and replicates, first loop is only over numcpu 
    for (locrep = 0; locrep < MIN (locirep, numcpu - 1); locrep++)
      {
        temp[0] = (long) (locrep / maxreplicate); // locus to work on
        temp[1] = (long) (locrep % maxreplicate;  // replicate to work with
        MYMPISEND (temp, 2, MPI_LONG, locrep + 1, locrep + 1, comm_world);
        numsent++;
      }
    // loope over all loci and replicates from numcpu to loci*replicates
    for (locrep = 0; locrep < locirep; locrep++)
      {
        done=FALSE;
        while(!done)
          {
            MYMPIRECV (tempstr, SMALLBUFSIZE, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG,
                      comm_world, &status);
            sender = status.MPI_SOURCE;
            switch(rawmessage[0])
              {
                case 'M':
                    tempstrsize = atol(tempstr+1);
                    MYMPIRECV (tempstr, tempstrsize, MPI_CHAR, sender, MPI_ANY_TAG,
                               comm_world, &status);
                    handle_message(tempstr,sender, world);
                    break;
                case 'R':
                    //ignore first character and translate into locusnumber
                    sscanf(tempstr,"R%li:%li",&locusdone,$replicatedone);
                    sender = status.MPI_SOURCE;
                    done=TRUE;
                    break;
                default:
                    error("DIED because of wrong message from worker");
                    break;
              }
          }
        who[locusdone][0] = sender;
        who[locusdone][1] = replicatedone;
        if (numsent < locirep)
          {
            temp[0] = (long) (numsent / maxreplicate); // locus to work on
            temp[1] = (long) (numsent % maxreplicate;  // replicate to work with
            MYMPISEND (temp, 2, MPI_LONG, sender, numsent + 1, comm_world);
            numsent++;
          }
        else
          {
            temp[0] = loci;
            temp[1] = -1;
            //tell workers to stop waiting for new loci
            MYMPISEND (temp, 2, MPI_LONG, sender, 0, comm_world);
          }
      }
    //tell worker who had never the chance to work on a locus, but are still
    //listening, that they can stop waiting
    temp[0] = loci;
    temp[1] = -1;
    for (sender = MIN (locirep, numcpu - 1) + 1; sender < numcpu; sender++)
      {
        MYMPISEND (temp, 2, MPI_LONG, sender, 0, comm_world);
      }
}


void
mpi_runreplicates_worker (world_fmt ** universe, int usize,
                    option_fmt * options, data_fmt * data,
                    tpool_t * heating_pool, long maxreplicate,
                    long *treefilepos, long *Gmax)
{
    boolean done = FALSE;
    long locus;
    char *rawmessage;
    long rawmsgsize = 0;
    MPI_Status status;
    rawmessage = (char *) calloc(STRSIZE,sizeof(char));
    while (!done)
      {
        MYMPIRECV (&locus, 1, MPI_LONG, MASTER, MPI_ANY_TAG,
                  comm_world, &status);
        if (status.MPI_TAG != 0) //stop condition
          {
            run_locus (universe, usize, options, data,
                       heating_pool, maxreplicate, locus, treefilepos, Gmax);
            rawmsgsize = sprintf(rawmessage,"R%li",locus) + 1;
            MYMPISEND (rawmessage, rawmsgsize, MPI_CHAR, MASTER, locus + 1, comm_world);
            /* we want to know what locus we worked for
                - to control the work sent by master
                - to use in setup_parameter0() [combroyden2.c] */
            universe[0]->who[locidone++] = locus;
          }
        else
          {
            done = TRUE;
          }
      }
    // transfer data to sub-master for summarizing
    // receive from master to whom I need to send locus/replicate
    // send replicate parts to sub-master
    // if submaster calculate over replicates
    // if submaster signal ready to master so that master can update who records
}

//---------end replication in MPI
#endif


double
mpi_likelihood_master (double *param, double *lparam,
                       world_fmt * world, nr_fmt * nr,
                       helper_fmt * helper, int *who)
{
    long locus, worker;
    long addon=1;
    long sender;
    MPI_Status status;
    double logres = 0.0;
    double *temp;
    double *tmp;
    int tag;

    long numelem = world->numpop2 + (world->options->gamma ? 1 : 0);
    long numelem2 = numelem * 2;
    tmp = (double *) calloc (world->loci, sizeof (double));
    temp = (double *) calloc (numelem2+2, sizeof (double));
    temp[0] = MIGMPI_LIKE;
    memcpy (temp + 1, param, numelem * sizeof (double));
    memcpy (temp + 1 + numelem, lparam, numelem * sizeof (double));
    memset (nr->locilikes, 0, sizeof (double) * world->loci);
    if(world->loci==1)
        addon=0;
    else
        addon=1;
    for (worker = 1; worker < MIN (world->loci + addon, numcpu); worker++)
    {
        MYMPISEND (temp, (int) numelem2+2, MPI_DOUBLE, worker, worker, comm_world);
    }
    for (worker = 1; worker < MIN (world->loci + addon, numcpu); worker++)
    {
        MYMPIRECV (tmp, (int) world->loci, MPI_DOUBLE, MPI_ANY_SOURCE,
                  MPI_ANY_TAG, comm_world, &status);
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
        // the worker send a vector of values
        // e.g. (0,0,0,-12,0,-34,0,0,0), these are all loci
        // of which most of them were not evaluated
        // t he loop updates the master copy of locilikes
        for (locus = 0; locus < world->loci; locus++)
        {
            nr->locilikes[locus] += tmp[locus];
        }
    }
    for (locus = 0; locus < world->loci; locus++)
    {
        logres += nr->locilikes[locus];
    }
    nr->llike = logres;
    free (temp);
    return logres;
}


void
mpi_likelihood_worker (world_fmt * world, helper_fmt * helper, long rep)
{
    long locus, ww;
    nr_fmt *nr = helper->nr;
    double *param = helper->expxv;
    double *lparam = helper->xv;
    double *mu_rates = world->options->mu_rates;
    memset (nr->locilikes, 0, sizeof (double) * world->loci);
    if (world->options->gamma)
    {
        if (lparam[nr->numpop2] > 9.903487553)
        {
            lparam[nr->numpop2] = 9.903487553;
        }
        initgammacat (nr->categs, EXP (lparam[nr->numpop2]),1./* EXP (lparam[0])*/,
                      nr->rate, nr->probcat);
    }

    for (ww = 0; ww < locidone; ww++)
    {
        locus = nr->world->who[ww];
        if (!world->options->gamma)
        {
            nr->locilikes[locus] =
                calc_locus_like (nr, param, lparam, locus) + mu_rates[locus];
        }
        else
        {
            helper->locus = locus;
            helper->nr->locilikes[locus] = gamma_locus_like (nr,
                                           param,
                                           lparam,
                                           helper->weight,
                                           locus);
        }
    }
}

void
mpi_startparam_master(world_fmt * world)
{
    long sender;
    MPI_Status status;
    double  *tmp;
    int tag;
    long i;
    int numreceived = 0;
    long workerloci=0;
    // MYMPIBARRIER(comm_world);
    tmp = (double*) calloc(world->numpop2+1,sizeof(double));
    while (numreceived < world->loci)
      {
       // printf("wait for data \n");
        MYMPIRECV (tmp, world->numpop2+1, MPI_DOUBLE, MPI_ANY_SOURCE,
                   MPI_ANY_TAG, comm_world, &status);
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
        //printf("received data form node %li with tag %i\n",sender,tag);
        workerloci=tmp[0];
        for(i=0; i<world->numpop2; i++)
            world->param0[i] += tmp[i+1];
        numreceived+=workerloci;
      }
    for(i=0; i<world->numpop2; i++)
        world->param0[i] /= world->loci;
 
    free(tmp);
}

void
mpi_startparam_worker (world_fmt * world)
{
    long ww;
    long repstart;
    long repstop;
    long r;
    long i;
    long locus;
    double *tmp;
    // MYMPIBARRIER(comm_world);
    if(locidone>0)
      {
//        fprintf("send startparam from node %i\n",myID);
    tmp = (double*) calloc(world->numpop2+1,sizeof(double));
        set_replicates (world, world->repkind, world->options->replicatenum,
                        &repstart, &repstop);
        tmp[0]=(double)locidone;
        for (ww = 0; ww < locidone; ww++)
        {
            locus = world->who[ww];
            for (r = repstart; r < repstop; r++)
            {
                for(i=0; i < world->numpop2; i++)
                    tmp[i+1] += world->atl[r][locus].param[i];
            }
        }
        for(i=1; i < world->numpop2+1; i++)
            tmp[i] /= locidone * (repstop-repstart);
    MYMPISEND (tmp, world->numpop2+1, MPI_DOUBLE, MASTER, myID, comm_world);
    free(tmp);
}
}


void
mpi_gmax_master (world_fmt * world, long *Gmax)
{
    long sender;
    MPI_Status status;
    long tmp;
    int tag;
    int numreceived = 0;
    *Gmax = 0.;
    MYMPIBCAST (Gmax, 1, MPI_LONG, MASTER, comm_world);
    while (numreceived < numcpu - 1)
    {
        MYMPIRECV (&tmp, 1, MPI_LONG, MPI_ANY_SOURCE,
                  MPI_ANY_TAG, comm_world, &status);
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
        if (*Gmax < tmp)
            *Gmax = tmp;
        numreceived++;
    }
    //  do we need this barrier really?
    MYMPIBARRIER(comm_world);
}

void
mpi_gmax_worker (world_fmt * world)
{
    long ww;
    long repstart;
    long repstop;
    long r;
    long locus;
    long Gmax = 1;
    MYMPIBCAST (&Gmax, 1, MPI_LONG, MASTER, comm_world);
    set_replicates (world, world->repkind, world->options->replicatenum,
                    &repstart, &repstop);

    for (ww = 0; ww < locidone; ww++)
    {
        locus = world->who[ww];
        for (r = repstart; r < repstop; r++)
        {
            if (Gmax < world->atl[r][locus].T)
                Gmax = world->atl[r][locus].T;
        }
    }
    MYMPISEND (&Gmax, 1, MPI_LONG, MASTER, myID, comm_world);
//  do we need this barrier really?
    MYMPIBARRIER(comm_world);
  }


void
mpi_send_stop (world_fmt * world)
{
    long worker;
    double *temp;
    long numelem = world->numpop2 + (world->options->gamma ? 1 : 0);
    long numelem2 = 2 * numelem;
    temp = (double *) calloc (numelem2+2, sizeof (double));
    temp[0] = MIGMPI_END;
    for (worker = 1; worker < numcpu; worker++)
    {
        MYMPISEND (temp, (int) numelem2+2, MPI_DOUBLE, worker, 0, comm_world); //end of loci
    }
    free (temp);
}

void
mpi_send_stop_tag (int worker, world_fmt * world)
{
    double *temp;
    long numelem = world->numpop2 + (world->options->gamma ? 1 : 0);
    long numelem2 = 2 * numelem;
    temp = (double *) calloc (numelem2+2, sizeof (double));
    temp[0] = MIGMPI_END;
    MYMPISEND (temp, (int) numelem2+2, MPI_DOUBLE, worker, 0, comm_world); //end of loci
    free (temp);
}

void
mpi_results_stop (void)
{
    long worker;
    long dummy = 0;
    for (worker = 1; worker < numcpu; worker++)
    {
        MYMPISEND (&dummy, 1, MPI_LONG, worker, 0, comm_world);
    }
}

void
mpi_gradient_master (nr_fmt * nr, world_fmt * world, int *who)
{
    long locus;
    long addon=1;
    long sender;
    MPI_Status status;
    int tag;

    double *temp;
    long *tempindex;
    long numelem = nr->partsize;
    long numelem2 = 2 * numelem;
    temp = (double *) calloc (numelem2+2, sizeof (double));
    tempindex = (long *) calloc (numelem, sizeof (long));
    temp[0] = MIGMPI_GRADIENT;
    memcpy (temp + 1, nr->param, numelem * sizeof (double));
    memcpy (tempindex, nr->indeks, nr->partsize * sizeof (long));
    memcpy (temp + 1 + numelem, nr->lparam, numelem * sizeof (double));
    temp[numelem2+1] = nr->profilenum;
    addon = (world->loci == 1)? 0 : 1;
    for (locus = 1; locus < MIN (world->loci + addon, numcpu); locus++)
    {
        MYMPISEND (temp, (int) numelem2+2, MPI_DOUBLE, locus, locus, comm_world);
        MYMPISEND (tempindex, (int) numelem, MPI_LONG, locus, locus, comm_world);
    }
    memset (nr->d, 0, sizeof (double) * numelem);
    for (locus = 1; locus < MIN (world->loci + addon, numcpu); locus++)
    {
        copy_and_clear_d (nr);
        MYMPIRECV (nr->d, (int) numelem, MPI_DOUBLE, MPI_ANY_SOURCE,
                  MPI_ANY_TAG, comm_world, &status);
        add_back_d (nr);
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
    }
    free (temp);
}

void
mpi_gradient_worker (helper_fmt * helper, nr_fmt * nr,
                     timearchive_fmt ** tyme)
{
    long ww, locus;
    memset (nr->d, 0, sizeof (double) * nr->partsize);
    for (ww = 0; ww < locidone; ww++)
    {
        locus = nr->world->who[ww];
        if(!nr->world->options->gamma)
        {
            copy_and_clear_d (nr);
            simple_loci_derivatives (nr->d, nr, tyme, locus);
            add_back_d (nr);
        }
        else
        {
            gamma_locus_derivative (helper, locus);
        }
    }
}

void
mpi_maximize_worker (world_fmt * world, long rep)
{
    boolean done = FALSE;
    long locus;
    MPI_Status status;
    nr_fmt *nr;
    helper_fmt helper;
    long repstart, repstop, Gmax;
    long numelem =  world->numpop2 + (world->options->gamma ?  1 : 0) ;
    long numelem2 = numelem * 2;
    double *temp;
    temp = (double *) calloc (numelem2 + 2, sizeof (double));
    helper.xv = (double *) calloc (numelem2, sizeof (double));
    helper.expxv = (double *) calloc (numelem2, sizeof (double));
    nr = (nr_fmt *) calloc (1, sizeof (nr_fmt));
    set_replicates (world, world->repkind, rep, &repstart, &repstop);

    which_calc_like (world->repkind);
    MYMPIBCAST (&Gmax, 1, MPI_LONG, MASTER, comm_world);
    create_nr (nr, world, Gmax, 0, world->loci, world->repkind, repstart);
    SETUPPARAM0 (world, nr, world->repkind,
                 repstart, repstop, world->loci, MULTILOCUS, TRUE);
    while (!done)
    {
        MYMPIRECV (temp, (int) numelem2+2, MPI_DOUBLE, MASTER, MPI_ANY_TAG,
                  comm_world, &status);
        locus = world->locus = status.MPI_TAG - 1;
        switch ((long) temp[0])
        {
        case MIGMPI_LIKE:
            memset (nr->locilikes, 0, sizeof (double) * world->loci);
            memcpy (helper.expxv, temp + 1, sizeof (double) * numelem);
            memcpy (helper.xv, temp + 1 + numelem,
                    sizeof (double) * numelem);
            fill_helper (&helper, helper.expxv, helper.xv, world, nr);
            mpi_likelihood_worker (world, &helper, rep);
            MYMPISEND (nr->locilikes, (int) world->loci, MPI_DOUBLE, MASTER,
                      locus + 1, comm_world);
            break;
        case MIGMPI_GRADIENT:
            memcpy (nr->param, temp + 1, sizeof (double) * (numelem - 1));
            memcpy (nr->lparam, temp + 1 + numelem,
                    sizeof (double) * numelem);
            fill_helper (&helper, nr->param, nr->lparam, world, nr);
            nr->profilenum = temp[numelem2 + 1];
            MYMPIRECV (nr->indeks, (int) numelem, MPI_LONG, MASTER, MPI_ANY_TAG,
                      comm_world, &status);
            mpi_gradient_worker (&helper, nr, world->atl);
            MYMPISEND (nr->d, (int) numelem, MPI_DOUBLE, MASTER, locus + 1,
                      comm_world);
            break;
        case MIGMPI_RESULT:
            mpi_results_worker (temp[0], world, repstop, pack_result_buffer);
            break;
        case MIGMPI_SUMFILE:
            mpi_results_worker (temp[0], world, repstop, pack_sumfile_buffer);
            break;
        case MIGMPI_MIGHIST:
            mpi_results_worker (temp[0], world, repstop, pack_mighist_buffer);
            break;
        case MIGMPI_END:
            done = TRUE;
            break;
        default:
            fprintf (stdout, "%i> does not understand task\n", myID);
            exit (0);
        }
    }
    free(helper.xv);
    free(helper.expxv);
    destroy_nr (nr, world);
}

void
broadcast_options_master (option_fmt * options)
{
    long bufsize = MAXBUFSIZE;
    char *buffer;
    buffer = (char *) calloc (1, sizeof (char));
    bufsize = save_options_buffer (&buffer, options);
    // MYMPIBARRIER(comm_world);
    MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
    MYMPIBCAST (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
    free (buffer);
}


void
broadcast_options_worker (option_fmt * options)
{
    long bufsize;
    char *buffer, *sbuffer;
    // MYMPIBARRIER(comm_world);
    MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
    buffer = (char *) calloc (bufsize + 1, sizeof (char));
    sbuffer = buffer;
    MYMPIBCAST (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
    read_options_worker (&buffer, options);
    free (sbuffer);
}


void
broadcast_data_master (data_fmt * data, option_fmt * options)
{
    long bufsize;
    char *buffer;
    buffer = (char *) calloc (1, sizeof (char));
    bufsize = pack_databuffer (&buffer, data, options);
    // MYMPIBARRIER(comm_world);
    MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
    MYMPIBCAST (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
    free (buffer);
}

void
broadcast_data_worker (data_fmt * data, option_fmt * options)
{
    long bufsize;
    char *buffer;
    // MYMPIBARRIER(comm_world);
    MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
    buffer = (char *) calloc (bufsize, sizeof (char));
    MYMPIBCAST (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
    unpack_databuffer (buffer, data, options);
    free (buffer);
}

long
pack_databuffer (char **buffer, data_fmt * data, option_fmt * options)
{
    long locus, pop, ind;
    long bufsize = 0;
    long biggest;
#ifdef UEP

    long sumtips, i;
#endif

    char fp[LONGLINESIZE];
    bufsize += LONGLINESIZE;
    *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
    bufsize += 1 + sprintf (fp, "%c %li %li %li\n", options->datatype, (long) data->hasghost,
                            data->numpop, data->loci);
    *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
    strcat (*buffer, fp);
    for (locus = 0; locus < data->loci; locus++)
    {
        bufsize += 1 + sprintf (fp, "%li\n", data->seq->sites[locus]);
        *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
    }
    bufsize += 1 + sprintf (fp, "%li %f\n", data->seq->addon, data->seq->fracchange);
    *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
    strcat (*buffer, fp);
    // population data
    for (pop = 0; pop < data->numpop; pop++)
    {
        bufsize += 1 + sprintf (fp, "%s\n", data->popnames[pop]);
        *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
        biggest = 0;
        for(locus=0; locus<data->loci; locus++)
        {
            bufsize += 1 + sprintf (fp, "%li %li\n", data->numind[pop][locus],data->numalleles[pop][locus]);
            if(biggest < data->numind[pop][locus])
                biggest = data->numind[pop][locus];
            *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
            strcat (*buffer, fp);
        }
        if (!strchr (SEQUENCETYPES, options->datatype))
        {
            for (ind = 0; ind < biggest; ind++)
            {
                bufsize += 1 + sprintf (fp, "%*.*s\n", (int) options->nmlength,
                                        (int) options->nmlength, data->indnames[pop][ind]);
                *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
                strcat (*buffer, fp);
                pack_allele_data (buffer, &bufsize, data, pop, ind);
            }
        }
        else
        {
            for(locus=0;locus<data->loci; ++locus)
            {
                for (ind = 0; ind < data->numind[pop][locus]; ind++)
                {
                    bufsize += 1 + sprintf (fp, "%*.*s\n", (int) options->nmlength, (int) options->nmlength,
                                            data->indnames[pop][ind]);
                    *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
                    strcat (*buffer, fp);
                    pack_sequence_data (buffer, &bufsize, data, pop, ind, locus);
                }
            }
        }
    }
    // geofile
    if (options->geo)
    {
        for (pop = 0; pop < data->numpop * data->numpop; pop++)
        {
            bufsize += 1 + sprintf (fp, "%f %f\n", data->geo[pop], data->lgeo[pop]);
            *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
            strcat (*buffer, fp);
        }
    }
    // uepfile
#ifdef UEP
    if (options->uep)
    {
        sumtips = 0;
        for (pop = 0; pop < data->numpop; ++pop)
            sumtips += data->numind[pop][0];//Assumes UEP is matched by locus 1
        bufsize += 1 + sprintf (fp, "%li %li\n", sumtips, data->uepsites);
        *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
        if (strchr (SEQUENCETYPES, options->datatype))
        {
            for (pop = 0; sumtips; pop++)
            {
                for (i = 0; i < data->uepsites; i++)
                {
                    bufsize += 1 + sprintf (fp, "%i\n", data->uep[pop][i]);
                    *buffer =
                        (char *) realloc (*buffer, sizeof (char) * bufsize);
                    strcat (*buffer, fp);
                }
            }
        }
        else
        {
            for (pop = 0; sumtips; pop++)
            {
                for (i = 0; i < data->uepsites; i++)
                {
                    bufsize += 1 + sprintf (fp, "%i %i\n", data->uep[pop][i],
                                            data->uep[pop + sumtips][i]);
                    *buffer =
                        (char *) realloc (*buffer, sizeof (char) * bufsize);
                    strcat (*buffer, fp);
                }
            }
        }
    }

#endif
    return bufsize;
}

void
pack_allele_data (char **buffer, long *bufsize, data_fmt * data, long pop,
                  long ind)
{
    char fp[LONGLINESIZE];
    long locus;
    for (locus = 0; locus < data->loci; locus++)
    {
        *bufsize += 1 + sprintf (fp, "%s %s\n", data->yy[pop][ind][locus][0],
                                 data->yy[pop][ind][locus][1]);
        *buffer = (char *) realloc (*buffer, sizeof (char) * *bufsize);
        strcat (*buffer, fp);
    }
}

void
pack_sequence_data (char **buffer, long *bufsize, data_fmt * data, long pop,
                    long ind, long locus)
{
    char *fp;
    //  long locus;
    //  fp = calloc (1, sizeof (char));
    // for (locus = 0; locus < data->loci; locus++)
    //   {
    fp = (char *) calloc ((2 + data->seq->sites[locus]), sizeof (char));
    sprintf (fp, "%s\n", data->yy[pop][ind][locus][0]);
    *bufsize += 2 + data->seq->sites[locus];
    *buffer = (char *) realloc (*buffer, sizeof (char) * *bufsize);
    strcat (*buffer, fp);
    //   }
    free (fp);
}

// this function and get_data() do not mix well!
void
unpack_databuffer (char *buffer, data_fmt * data, option_fmt * options)
{
    long locus, pop, ind, i=0;
    long biggest;
#ifdef UEP

    long sumtips;
#endif

    char *buf = buffer;
    char *input;
    long hasghost;
    input = (char *) calloc (LONGLINESIZE, sizeof (char));
    sgets (input, LONGLINESIZE, &buf);
    sscanf (input, "%c%li%li%li", &options->datatype, &hasghost, &data->numpop,
            &data->loci);
    data->hasghost = (boolean) hasghost;
    init_data_structure1 (&data, options);
    for (locus = 0; locus < data->loci; locus++)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li", &data->seq->sites[locus]);
    }
    sgets (input, LONGLINESIZE, &buf);
    sscanf (input, "%li%lf", &data->seq->addon, &data->seq->fracchange);
    // population data
    for (pop = 0; pop < data->numpop; pop++)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%s", data->popnames[pop]);
        biggest=0;
        for(locus=0; locus<data->loci; locus++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%li %li", &data->numind[pop][locus],&data->numalleles[pop][locus]);
            if(biggest<data->numind[pop][locus])
                biggest = data->numind[pop][locus];
        }
        init_data_structure2 (&data, options, pop);
        if (!strchr (SEQUENCETYPES, options->datatype))
        {
            for (ind = 0; ind < biggest; ind++)
            {
                sgets (input, LONGLINESIZE, &buf);
                sscanf (input, "%s", data->indnames[pop][ind]);
                for (locus = 0; locus < data->loci; locus++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%s %s", data->yy[pop][ind][locus][0],
                            data->yy[pop][ind][locus][1]);
                }
            }
        }
        else
        {
            for (locus = 0; locus < data->loci; locus++)
            {
                for (ind = 0; ind < data->numind[pop][locus]; ind++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%s", data->indnames[pop][ind]);
                    input =(char *) realloc (input, sizeof (char) * (100 + data->seq->sites[locus]));
                    sgets (input, 100 + data->seq->sites[locus], &buf);
                    sscanf (input, "%s", data->yy[pop][ind][locus][0]);
                }
            }
        }
    }
    // geofile
    data->geo =
        (double *) calloc (1, sizeof (double) * data->numpop * data->numpop);
    data->lgeo =
        (double *) calloc (1, sizeof (double) * data->numpop * data->numpop);
    if (!options->geo)
    {
        for (i = 0; i < data->numpop * data->numpop; i++)
            data->geo[i] = 1.0;
    }
    else
    {
        for (pop = 0; pop < data->numpop * data->numpop; pop++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%lf%lf", &data->geo[pop], &data->lgeo[pop]);
        }
    }
    // uepfile
#ifdef UEP
    if (options->uep)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li%li", &sumtips, &data->uepsites);
        data->uep =
            (int **) calloc (number_genomes (options->datatype) * sumtips,
                             sizeof (int *));
        if (strchr (SEQUENCETYPES, options->datatype))
        {
            for (pop = 0; sumtips; pop++)
            {
                data->uep[i] = (int *) calloc (data->uepsites, sizeof (int));
                for (i = 0; i < data->uepsites; i++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%i", &data->uep[pop][i]);
                }
            }
        }
        else
        {
            for (pop = 0; sumtips; pop++)
            {
                data->uep[i] = (int *) calloc (data->uepsites, sizeof (int));
                data->uep[i + sumtips] =
                    (int *) calloc (data->uepsites, sizeof (int));
                for (i = 0; i < data->uepsites; i++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%i%i", &data->uep[pop][i],
                            &data->uep[pop + sumtips][i]);
                }
            }
        }

    }
#endif
    init_data_structure3 (data);
    /* replace this, if loci can have different number of ind */
#ifndef NEWSTYLE

    set_numind (data);
#endif

    switch (options->datatype)
    {
    case 'a':
        create_alleles (data);
        break;
    case 'b':
        for (pop = 0; pop < data->loci; pop++)
            data->maxalleles[pop] = XBROWN_SIZE;
        break;
    case 'm':
        create_alleles (data);
        for (pop = 0; pop < data->loci; pop++)
            data->maxalleles[pop] = options->micro_stepnum;
        break;
    }
    free (input);
}

void
unpack_result_buffer (char *buffer, world_fmt * world,
                      long locus, long maxrep, long numpop)
{
    long rep, pop;
    long addon=0;
    timearchive_fmt **atl = world->atl;
    double ***apg0 = world->apg0;
    char *input;
    char *buf = buffer;
    input = (char*) calloc(LONGLINESIZE,sizeof(char));
    if (maxrep>1)
        addon=1;
    for (rep = 0; rep < maxrep + addon; rep++)
    {
        sgets (input, LONGLINESIZE, &buf);
        atl[rep][locus].param_like = atof (input);
        for (pop = 0; pop < 4 * numpop * numpop; pop++)
        {
            sgets (input, LONGLINESIZE, &buf);
            atl[rep][locus].parameters[pop] = atof (input);
        }
    }
    // apg0
    for (rep = 0; rep < maxrep; rep++)
    {
        for (pop = 0; pop < world->options->lsteps; pop++)
        {
            sgets (input, LONGLINESIZE, &buf);
            apg0[rep][locus][pop] = atof (input);
        }
    }
    free(input);
}

long
pack_result_buffer (char **buffer, world_fmt * world,
                    long locus, long maxrep, long numpop)
{
    long rep, pop;
    char *input;
    long addon=0;
    timearchive_fmt **atl = world->atl;
    double ***apg0 = world->apg0;
    long bufsize = 1;  //maxrep * (1 + 4*numpop*numpop + world->options->lsteps) * 42 ;
    input = (char*) calloc(LONGLINESIZE,sizeof(char));
    (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
    memset (*buffer, 0, sizeof (char) * bufsize);
    strcpy (input, "");
    if (maxrep>1)
        addon=1;

    for (rep = 0; rep < maxrep + addon; rep++)
    {
        bufsize += 1 + sprintf (input, "%20.20f\n", atl[rep][locus].param_like);
        (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, input);
        for (pop = 0; pop < 4 * numpop * numpop; pop++)
        {
            bufsize += 1 + sprintf (input, "%20.20f\n", atl[rep][locus].parameters[pop]);
            (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
            strcat (*buffer, input);
        }
    }
    // apg0
    for (rep = 0; rep < maxrep; rep++)
    {
        for (pop = 0; pop < world->options->lsteps; pop++)
        {
            bufsize += 1 + sprintf (input, "%20.20f\n", apg0[rep][locus][pop]);
            (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
            strcat (*buffer, input);
        }
    }
    free(input);
    return bufsize;
}

void
unpack_mighist_buffer (char *buffer, world_fmt * world,
                       long locus, long maxrep, long numpop)
{
    long i, j;
    mighistloci_fmt *aa;
    char *input;
    char *buf = buffer;
    aa = &world->mighistloci[locus];
    input = (char*) calloc(LONGLINESIZE,sizeof(char));
    sgets (input, LONGLINESIZE, &buf);
    sscanf (input, "%li", &aa->mighistnum);
    if(aa->allocsize <= aa->mighistnum)
    {
        aa->mighist = (mighist_fmt *) realloc (aa->mighist, sizeof (mighist_fmt) *(aa->mighistnum+1));
        for(j=aa->allocsize; j<=aa->mighistnum; j++)
        {
            aa->mighist[j].allocsize=1;
            aa->mighist[j].migeventsize=0;
            aa->mighist[j].migevents =
                (migevent_fmt *) calloc (1,  sizeof (migevent_fmt) );
            //printf("%i> first events: alloc=%li size=%li\n", myID, aa->mighist[j].allocsize , aa->mighist[j].migeventsize);
        }
        aa->allocsize = aa->mighistnum+1;
    }
    for (j = 0; j < aa->mighistnum; j++)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li %li %li", &aa->mighist[j].copies,
                &aa->mighist[j].weight, &aa->mighist[j].migeventsize);
        //printf("%i> events: alloc=%li size=%li\n", myID, aa->mighist[j].allocsize , aa->mighist[j].migeventsize);
        aa->mighist[j].allocsize = aa->mighist[j].migeventsize;
        aa->mighist[j].migevents = (migevent_fmt *) realloc (aa->mighist[j].migevents,
                                   sizeof (migevent_fmt) *
                                   aa->mighist[j].allocsize);
        for (i = 0; i < aa->mighist[j].migeventsize; i++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%lf %lf %lf",
                    &aa->mighist[j].migevents[i][0],
                    &aa->mighist[j].migevents[i][1],
                    &aa->mighist[j].migevents[i][2]);
        }
    }
    free(input);
}


long
pack_mighist_buffer (char **buffer, world_fmt * world,
                     long locus, long maxrep, long numpop)
{
    long j, i;
    mighistloci_fmt *aa;
    char *input;
    long bufsize = 1;
    aa = &world->mighistloci[locus];
    input = (char*) calloc(LONGLINESIZE,sizeof(char));
    for (j = 0; j < aa->mighistnum; j++)
        bufsize += 100 + aa->mighist[j].migeventsize * 100;
    (*buffer) = (char *) realloc ((*buffer), sizeof (char) * bufsize);
    memset (*buffer, 0, sizeof (char) * bufsize);
    sprintf (input, "%li\n", aa->mighistnum);
    strcat ((*buffer), input);

    for (j = 0; j < aa->mighistnum; j++)
    {
        sprintf (input, "%li %li %li\n", aa->mighist[j].copies,
                 aa->mighist[j].weight, aa->mighist[j].migeventsize);
        strcat ((*buffer), input);
        for (i = 0; i < aa->mighist[j].migeventsize; i++)
        {
            sprintf (input, "%f %f %f\n",
                     aa->mighist[j].migevents[i][0],
                     aa->mighist[j].migevents[i][1],
                     aa->mighist[j].migevents[i][2]);
            strcat ((*buffer), input);
        }
    }
    free(input);
    return strlen (*buffer) + 1;
}

// there are differences between unpack_sumfile() and and read_savesum()
// this needs reconciliation
void
unpack_sumfile_buffer (char *buffer, world_fmt * world,
                       long locus, long maxrep, long numpop)
{
    long r, i, j;
    long l = locus;
    char *input;
    timearchive_fmt **ta = world->atl;
    char *buf = buffer;
    input = (char*) calloc(LONGLINESIZE*world->numpop2,sizeof(char));
    for (r = 0; r < maxrep; r++)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li %li %li %lg", &ta[r][l].T, &ta[r][l].numpop,
                &ta[r][l].sumtips, &ta[r][locus].param_like);
        world->chainlikes[l][r] = ta[r][l].param_like;

        increase_timearchive (world, l, ta[r][l].T, world->numpop, r);
        for (i = 0; i < ta[r][l].T; i++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%li %lg", &ta[r][l].tl[i].copies, &ta[r][l].tl[i].lcopies);
            for (j = 0; j < 3 * numpop + numpop * (numpop - 1); j++)
            {
                sgets (input, LONGLINESIZE, &buf);
                sscanf (input, "%lg", &ta[r][l].tl[i].data[j]);
            }
        }
        for (i = 0; i < world->numpop2; i++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%lg %lg", &ta[r][l].param[i], &ta[r][l].param0[i]);
        }
        log_param0 (ta[r][l].param0, ta[r][l].lparam0, world->numpop2);
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li %lg", &ta[r][l].trials, &ta[r][l].normd);
    }
    free(input);
}


long
pack_sumfile_buffer (char **buffer, world_fmt * world,
                     long locus, long maxrep, long numpop)
{
    long r, i, j;
    long numpop2 = numpop * numpop;
    long l = locus;
    char *input;
    char *tempbuffer;
    char *tempbuffer2;
    timearchive_fmt **ta = world->atl;
    // calc bufsize needed
    long bufsize = 1;
    long allocbufsize = 0;
    for (r = 0; r < maxrep; r++)
    {
        allocbufsize += 100 + ta[r][l].T * (22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50);
    }
    (*buffer) = (char *) realloc ((*buffer), allocbufsize * sizeof (char));
    tempbuffer = (char *) calloc((22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50),sizeof(char));
    tempbuffer2 = (char *) calloc(101* (22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50),sizeof(char));
    (*buffer)[0] = '\0';
    //memset (*buffer, 0, sizeof (char));
    input = (char*) calloc(LONGLINESIZE*world->numpop2,sizeof(char));
    for (r = 0; r < maxrep; r++)
    {
        sprintf (input, "%li %li %li %g \n", ta[r][l].T,
                 ta[r][l].numpop, ta[r][l].sumtips, ta[r][l].param_like);
        //      bufsize += strlen (input) + 1;
        //      (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
        strcat ((*buffer), input);
        memset(tempbuffer2,0,sizeof(char)*101*(22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50));
        for (i = 0; i < ta[r][l].T; i++)
        {
            //   memset(tempbuffer,0,sizeof(char)*(22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50));
            tempbuffer[0]='\0';
            sprintf (input, "%li %g\n", ta[r][l].tl[i].copies, ta[r][l].tl[i].lcopies);
            strcat (tempbuffer, input);
            //       printf("%5li %5li\n",i,j);
            for (j = 0; j < 3 * numpop + numpop * (numpop - 1); j++)
            {
                sprintf (input, "%g\n", ta[r][l].tl[i].data[j]);
                strcat (tempbuffer, input);
            }
            strcat(tempbuffer2,tempbuffer);
            if((i+1) % 100 == 0)
            {
                strcat(*buffer,tempbuffer2);
                tempbuffer2[0] = '\0';
            }
        }
        strcat (*buffer, tempbuffer2);
        tempbuffer[0]='\0';
        tempbuffer2[0]='\0';
        for (i = 0; i < numpop2; i++)
        {
            sprintf (input, "%g %g\n", ta[r][l].param[i], ta[r][l].param0[i]);
            strcat (tempbuffer, input);
        }
        sprintf (input, "%li %g\n", ta[r][l].trials, ta[r][l].normd);
        strcat (tempbuffer, input);
        //   strcat(tempbuffer2,tempbuffer);
        strcat(*buffer,tempbuffer);
    }
    bufsize = strlen(*buffer);
    if(bufsize > allocbufsize)
        error("allocation exceeded in pack_sumfile_buffer");
    free(input);
    free(tempbuffer);
    free(tempbuffer2);
    return bufsize;
}

void
mpi_results_master (double sendtype, world_fmt * world, long maxreplicate,
                    void (*unpack) (char *buffer, world_fmt * world,
                                    long locus, long maxrep, long numpop))
{
    long numpop = world->numpop;
    int bufsize = 1;
    //  long addon = (sendtype == MIGMPI_SUMFILE) ? 0 : ((world->loci == 1) ? 0 : 1) ;
    boolean done = FALSE;
    char *buffer, *sbuffer=NULL;
    double *temp;
    int locus;
    long z, tag, sender;
    MPI_Status status;
    long numelem = world->numpop2 + (world->options->gamma ? 1 : 0);
    long numelem2 = 2 * numelem;
#ifdef BIGMEMORY

    long *workload;
    long workerloci;
    long offset;
    long i;
    workload = (long*) calloc(world->loci*2+2,sizeof(long));
    //printf("%i> mpi_result_master()\n",myID);
#endif

    temp = (double *) calloc (numelem2, sizeof (double));
    buffer = (char *) calloc (bufsize, sizeof (char));
    temp[0] = sendtype;
    temp[1] = bufsize;
    for (locus = 1; locus < MIN (world->loci + 1, numcpu); locus++)
    {
        MYMPISEND (temp, numelem2, MPI_DOUBLE, locus, locus, comm_world);
    }
    z = 0;
    while (!done)
    {
        if (z++ >= world->loci)
        {
            break;
        }
#ifdef BIGMEMORY
        MYMPIRECV (workload, 2*world->loci+2, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  comm_world, &status);
        bufsize = workload[2*world->loci] ;
        workerloci = workload[2*world->loci+1];
        buffer = (char *) realloc (buffer, sizeof (char) * (bufsize+ 1));
        memset (buffer, 0, sizeof (char) * (bufsize + 1));
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
        MYMPIRECV (buffer, bufsize, MPI_CHAR, sender, tag, comm_world, &status);
        sbuffer = buffer;
        for(i=0; i<workerloci;i++)
        {
            locus = (int) workload[2*i];
            offset = workload[2*i+1];
            //printf("%i> locus=%i offset=%li\n",myID,locus,offset);
            (*unpack) (buffer+offset, world, locus, maxreplicate, numpop);
        }
        z += workerloci-1; //the test condition will add another one
#else

        MYMPIRECV (&bufsize, 1, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  comm_world, &status);
        buffer = (char *) realloc (buffer, sizeof (char) * (bufsize + 1));
        memset (buffer, 0, sizeof (char) * (bufsize + 1));
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
        MYMPIRECV (buffer, bufsize, MPI_CHAR, sender, tag, comm_world, &status);
        sbuffer = buffer;
        (*unpack) (buffer, world, tag - 1, maxreplicate, numpop);
#endif

    }
    free (sbuffer);
    free (temp);
}

void
mpi_results_worker (long bufs, world_fmt * world, long maxrep,
                    long (*pack) (char **buffer, world_fmt * world,
                                  long locus, long maxrep, long numpop))
{
    long numpop = world->numpop;
    int ww, locus;
    char *allbuffer;
    //char **buf;
    long bufsize = locidone * 20; //enough space to accomodate the locus vector
#ifdef BIGMEMORY

    char *buffer;
    long allbufsize=0;
    long *workload;
    long zz=0;
#endif

    allbuffer = (char *) calloc (bufsize, sizeof (char)); //some small value
    //buf = &buffer;
#ifdef BIGMEMORY

    //printf("%i> mpi_result_worker()\n",myID);
    fflush(stdout);
    bufsize = 0;
    workload = (long *) calloc(2+world->loci*2, sizeof(long));//{locus1,strlen1,locus2,strlen2,.....,totalstrlen,n_loci}
    for (ww = 0; ww < locidone; ww++)
    {
        buffer = (char *) calloc (bufsize+1, sizeof (char));//some small value
        locus = world->who[ww];
        bufsize = (*pack) (&buffer, world, locus, maxrep, numpop);
        bufsize = strlen(buffer);
        workload[zz++] = (long) locus;
        workload[zz++] = allbufsize;
        allbufsize += bufsize;
        //printf("%i> locus=%li size=%li\n",myID,workload[zz-2],workload[zz-1]);
        allbuffer = (char *) realloc (allbuffer, sizeof (char)*allbufsize+1);
        strcat(allbuffer,buffer);
        free(buffer);
    }
    workload[2*world->loci]=strlen(allbuffer)+1;
    workload[2*world->loci+1]=(long)locidone;
    MYMPISEND (workload, 2+world->loci*2, MPI_LONG, MASTER, myID, comm_world);
    MYMPISEND (allbuffer, workload[2*world->loci], MPI_CHAR, MASTER, myID, comm_world);
    free(workload);
#else

    for (ww = 0; ww < locidone; ww++)
    {
        allbuffer = (char *) realloc (allbuffer, sizeof (char)); //some small value
        //    buf = &allbuffer;
        locus = world->who[ww];
        bufsize = (*pack) (&allbuffer, world, locus, maxrep, numpop);
        MYMPISEND (&bufsize, 1, MPI_LONG, MASTER, locus + 1, comm_world);
        MYMPISEND (allbuffer, bufsize, MPI_CHAR, MASTER, locus + 1, comm_world);
    }

#endif /*BIGMEMORY*/

}

void
mpi_broadcast_results (world_fmt * world, long loci,
                       long (*pack) (char **buffer, world_fmt * world,
                                     long locus, long maxrep, long numpop),
                       void (*unpack) (char *buffer, world_fmt * world,
                                       long locus, long maxrep, long numpop))
{
    long locus;
    // long addon = (world->loci == 1) 0 : 1;
    long bufsize=1;
    char nowstr[STRSIZE];
    char *allbuffer;// = &world->buffer;
#ifdef BIGMEMORY

    long allbufsize=0;
    char *buffer;
    long *workload;
    long workerloci=0;
    long i;
    long offset=0;
    long zz=0;
#endif

    long maxreplicate = (world->options->replicate
                         && world->options->replicatenum >
                         0) ? world->options->replicatenum : 1;
    allbuffer = (char *) calloc (10, sizeof (char));
#ifdef BIGMEMORY

    workload = (long *) calloc( 2 * (loci) + 2, sizeof(long));
    //  free(allbuffer);
    //  allbuffer = (char *) calloc (10, sizeof (char));
#endif

    get_time (nowstr, "%H:%M:%S");
    //if(world->options->progress)
    // printf("%i> Redistributing the data\nResult parts [Time is %s]\n",myID, nowstr);
#ifdef BIGMEMORY

    if (myID == MASTER)
    {
        for (locus = 0; locus < loci; locus++)
        {
            buffer = (char *) calloc (1, sizeof (char));
            bufsize = (*pack)(&buffer, world, locus, maxreplicate,
                              world->numpop);
            bufsize = strlen(buffer);
            workload[zz++] = (long) locus;
            workload[zz++] = allbufsize;
            allbufsize += bufsize;
            //printf("%i> result locus=%li size=%li\n",myID,workload[zz-2],workload[zz-1]);
            allbuffer = (char *) realloc ( allbuffer, sizeof (char)*(allbufsize+1));
            strcat(allbuffer,buffer);
            free(buffer);
        }
        workload[2*loci]=allbufsize;
        workload[2*loci+1]=(long)loci;
        // MYMPIBARRIER(comm_world);
        MYMPIBCAST (workload, 2 * (loci) + 2, MPI_LONG, MASTER, comm_world);
        MYMPIBCAST (allbuffer, allbufsize+1, MPI_CHAR, MASTER, comm_world);
        //printf("%i> Locus %li results sent\n",myID, locus);
    }
    else
    { // worker node

        // MYMPIBARRIER(comm_world);
        MYMPIBCAST (workload, 2*(loci)+2, MPI_LONG, MASTER, comm_world);
        bufsize =  workload[2*loci] ;
        workerloci = workload[2*loci+1];
        allbuffer = (char *) realloc (allbuffer,
                                      sizeof (char) * (bufsize + 1));
        memset (allbuffer, 0, sizeof (char) * (bufsize + 1));
        MYMPIBCAST (allbuffer, bufsize + 1, MPI_CHAR, MASTER, comm_world);
        for(i=0; i<workerloci; i++)
        {
            locus = (int) workload[2*i];
            offset = workload[2*i+1];
            (*unpack)(allbuffer + offset, world, locus, maxreplicate,
                      world->numpop);
        }
        // printf("%i> Loci results received\n",myID);
    }
#else
    for (locus = 0; locus < loci; locus++)
    {
        if (myID == MASTER)
        {
            bufsize =(*pack) (&allbuffer, world, locus, maxreplicate,
                              world->numpop);
            MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
            MYMPIBCAST (allbuffer, bufsize, MPI_CHAR, MASTER, comm_world);
            // printf("%i> Locus %li results sent\n",myID, locus);
        }
        else
        {
            MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
            allbuffer = (char *) realloc (allbuffer, sizeof (char) * bufsize + 1);
            MYMPIBCAST (allbuffer, bufsize, MPI_CHAR, MASTER, comm_world);
            (*unpack)(allbuffer, world, locus, maxreplicate,
                      world->numpop);
            //printf("%i> Locus %li results received\n",myID, locus);
        }
        memset (allbuffer, 0, sizeof (char) * bufsize);
    }
#endif
    free(allbuffer);
}

// slownet profiler
//
#ifdef SLOWNET

void
mpi_profiles_master (world_fmt * world, long nparam, int *profilewho)
{
    long pnum;
    int sender=0;
    //  int i;
    long pdone;
    boolean done;
    MPI_Status status;
    long numsent = 0;
    long bufsize;
    long quantsize;
    char *tempstr;
    long tempstrsize=MAXBUFSIZE;
    long temp[3];
    char **buffer = &world->buffer;
    FILE *outfile = world->outfile;

    tempstr = (char*) calloc(MAXBUFSIZE,sizeof(char));

    for (pnum = 0; pnum < MIN (nparam, numcpu - 1); pnum++)
    {
        MYMPISEND (&pnum, 1, MPI_LONG, pnum + 1, pnum + 1, comm_world);
        numsent++;
    }
    for (pnum = 0; pnum < nparam; pnum++)
    {
        done = FALSE;
        while(!done)
        {
            MYMPIRECV (tempstr, SMALLBUFSIZE, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, comm_world, &status);
            sender = status.MPI_SOURCE;
            switch(tempstr[0])
            {
            case 'M':
                 tempstrsize = atol(tempstr+1);
                MYMPIRECV (tempstr, tempstrsize, MPI_CHAR, sender, MPI_ANY_TAG,
                           comm_world, &status);                
                handle_message(tempstr,sender, world);
                break;
            case 'P':
                MYMPIRECV (temp, 3, MPI_LONG, sender, MPI_ANY_TAG,
                           comm_world, &status);
                pdone = temp[0];
                bufsize = temp[1];
                quantsize = temp[2];
                profilewho[pdone] = sender;
                *buffer =
                    (char *) realloc (*buffer, sizeof (char) * (bufsize + quantsize + 1));
                memset (*buffer, 0, sizeof (char) * (bufsize + quantsize + 1));
                MYMPIRECV (*buffer, bufsize + quantsize, MPI_CHAR, sender, MPI_ANY_TAG,
                          comm_world, &status);
                if(world->options->printprofsummary)
                {
                    unpack_quantile ((*buffer) + bufsize, world->quantiles[pdone],
                                     GRIDSIZE);
                    memset ((*buffer) + bufsize, 0, sizeof (char) * quantsize);
                }
                fprintf (outfile, "%s\n\n", *buffer);
                done=TRUE;
                break;
            default:
                error("DIED because of wrong message from worker");
                break;
            }
        }
        if (numsent < nparam)
        {
            MYMPISEND (&numsent, 1, MPI_LONG, sender, numsent + 1, comm_world);
            numsent++;
        }
        else
        {
            // stop worker because there is nothing to do anymore
            MYMPISEND (&nparam, 1, MPI_LONG, sender, 0, comm_world); //end of parameter list
        }
    }
    // stop workers that did nothing for profiles
    for (sender = MIN (nparam, numcpu - 1) + 1; sender < numcpu ; sender++)
    {
        // stop all nodes to wait for profiles
        MYMPISEND (&nparam, 1, MPI_LONG, sender, 0, comm_world); 
    }
}

void
mpi_profiles_worker (world_fmt * world, long *gmaxptr)
{
    boolean done = FALSE;
    long pnum;
    long temp[3];
    char *tempstr;
    long tempstrsize;
    char *quantilebuffer;
    MPI_Status status;
    long qbufsize = 1;
    quantilebuffer = (char *) calloc (qbufsize, sizeof (char));
    tempstr = (char *) calloc (MAXBUFSIZE, sizeof (char));
    while (!done)
    {
        MYMPIRECV (&pnum, 1, MPI_LONG, MASTER, MPI_ANY_TAG, comm_world, &status);
        if (status.MPI_TAG != 0) //stop condition
        {
            // fills world->buffer with profile information
            print_profile_likelihood_driver (pnum, world, gmaxptr);
            temp[0] = pnum;
            temp[1] = strlen (world->buffer);
            if(world->options->printprofsummary)
            {
                temp[2] = pack_quantile (&quantilebuffer, world->quantiles[pnum], GRIDSIZE);
                world->buffer =     (char *) realloc (world->buffer,
                                                      sizeof (char) * (temp[1] + temp[2] + 1));
                strcat (world->buffer, quantilebuffer);
            }
            else
                temp[2] = 0;
            tempstrsize = 1 + sprintf(tempstr,"P%li", temp[1]);
            tempstr[0]='P';
            MYMPISEND (tempstr, SMALLBUFSIZE , MPI_CHAR, MASTER, pnum + 1, comm_world);
            MYMPISEND (temp, 3 , MPI_LONG, MASTER, pnum + 1, comm_world);
            MYMPISEND (world->buffer, temp[1] + temp[2], MPI_CHAR, MASTER,
                      pnum + 1, comm_world);
            world->profilewho[profiledone++] = pnum;
        }
        else
        {
            done = TRUE;
        }
    }
    free(tempstr);
    free (quantilebuffer);
}

long
pack_quantile (char **buffer, quantile_fmt quant, long n)
{
    long i;
    char fp[LONGLINESIZE];
    long bufsize = LONGLINESIZE;
    *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
    sprintf (*buffer, "QUANTILEBUFFER:\n %s\n", quant.name);
    for (i = 0; i < n; i++)
    {
        bufsize += 1 + sprintf (fp, "%20.20f\n", quant.param[i]);
        *buffer = (char *) realloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
    }
    return bufsize;
}

void
unpack_quantile (char *buffer, quantile_fmt quant, long n)
{
    long i;
    char *input;
    char *buf = buffer;
    input = (char*) calloc(LONGLINESIZE,sizeof(char));
    sgets (input, LONGLINESIZE, &buf);
    sgets (input, LONGLINESIZE, &buf);
    strcpy (quant.name, input);
    for (i = 0; i < n; i++)
    {
        sgets (input, LONGLINESIZE, &buf);
        quant.param[i] = atof (input);
    }
    free(input);
}

#endif

/*
// send the data over all loci/replicates to all nodes
// including the master node, so that all nodes can then 
// start calculating profiles [see calc_profiles()]
//
void distribute_locidata(world_fmt *world)
{
  char *buffer;
  pack_loci_data(world, &buffer);
  MPI_allgather(buffer);
  unpack_loci_data(buffer, world);
  free(buffer);
}
 
void pack_loci_data(world_fmt *world, char **buffer)
{
  long replicates = world->options->repl
  *buffer = realloc(*buffer,LONGLINESIZE);
  hits = sscanf (input, "%li %li %li %li %li", &world->loci, &world->numpop, &world->numpop2, &tmp, &replicates);  
}
*/
// necessary for analyzing old sumfiles using MPI
//
// master is reusing  mpi_runloci_master()
void
assignloci_worker (world_fmt * world)
{
    boolean done = FALSE;
    long locus;
    MPI_Status status;
    while (!done)
    {
        MYMPIRECV (&locus, 1, MPI_LONG, MASTER, MPI_ANY_TAG,
                  comm_world, &status);
        if (status.MPI_TAG != 0) //stop condition
        {
            swap_atl (locus, locidone, world);

            MYMPISEND (&locus, 1, MPI_LONG, MASTER, locus + 1, comm_world);
            /* we want to know what locus we worked for
               - to control the work sent by master
               - to use in setup_parameter0() [combroyden2.c] */
            world->who[locidone++] = locus;
        }
        else
        {
            done = TRUE;
        }
    }
}

void
swap_atl (long from, long to, world_fmt * world)
{
    long r;
    timearchive_fmt *tmp;
    for (r = 0; r < world->options->replicatenum; r++)
    {
        tmp = &world->atl[r][to];
        world->atl[r][to] = world->atl[r][from];
        world->atl[r][from] = *tmp;
    }
}


#ifdef SLOWNET
void
setup_parameter0_slowmpi (world_fmt * world, nr_fmt * nr, long repkind,
                          long repstart, long repstop, long loci, long kind,
                          boolean multilocus)
{
    long locus, r;
    if (myID != MASTER)
    {
        if (multilocus)
        {
            for (locus = 0; locus < loci; locus++)
            {
                if (repkind == SINGLECHAIN)
                {
                    for (r = repstart; r < repstop; r++)
                        create_apg0 (nr->apg0[r][locus], nr,
                                     &world->atl[r][locus], locus);
                }
                else
                {
                    if (kind != PROFILE)
                    {
                        for (r = repstart; r < repstop; r++)
                            create_apg0 (nr->apg0[r][locus], nr,
                                         &world->atl[r][locus], locus);
                        interpolate_like (nr, locus);
                    }
                    else
                    {
                        for (r = repstart; r < repstop; r++)
                            create_multiapg0 (nr->apg0[r][locus], nr, r, locus);
                    }
                }
            }
        }
        else   //single locus
        {
            if (repkind == SINGLECHAIN)
            {
                for (r = repstart; r < repstop; r++)
                    create_apg0 (nr->apg0[r][world->locus], nr,
                                 &world->atl[r][world->locus], world->locus);
            }
            else
            {
                if (kind != PROFILE)
                {
                    for (r = repstart; r < repstop; r++)
                        create_apg0 (nr->apg0[r][world->locus], nr,
                                     &world->atl[r][world->locus], world->locus);
                    interpolate_like (nr, world->locus);
                }
                for (r = repstart; r < repstop; r++)
                    create_multiapg0 (nr->apg0[r][world->locus], nr, r,
                                      world->locus);
            }
        }
    }
}
#endif

void
setup_parameter0_mpi (world_fmt * world, nr_fmt * nr, long repkind,
                      long repstart, long repstop, long loci, long kind,
                      boolean multilocus)
{
    long locus, r;
    long ll;
    if (myID != MASTER)
    {
        if (multilocus)
        {
            for (ll = 0; ll < locidone; ll++)
            {
                locus = world->locus = world->who[ll];
                if (repkind == SINGLECHAIN)
                {
                    for (r = repstart; r < repstop; r++)
                        create_apg0 (nr->apg0[r][locus], nr,
                                     &world->atl[r][locus], locus);
                }
                else
                {
                    if (kind != PROFILE)
                    {
                        for (r = repstart; r < repstop; r++)
                            create_apg0 (nr->apg0[r][locus], nr,
                                         &world->atl[r][locus], locus);
                        interpolate_like (nr, locus);
                    }
                    else
                    {
                        for (r = repstart; r < repstop; r++)
                            create_multiapg0 (nr->apg0[r][locus], nr, r, locus);
                    }
                }
            }
        }
        else   //single locus
        {
            if (repkind == SINGLECHAIN)
            {
                for (r = repstart; r < repstop; r++)
                    create_apg0 (nr->apg0[r][world->locus], nr,
                                 &world->atl[r][world->locus], world->locus);
            }
            else
            {
                if (kind != PROFILE)
                {
                    for (r = repstart; r < repstop; r++)
                        create_apg0 (nr->apg0[r][world->locus], nr,
                                     &world->atl[r][world->locus], world->locus);
                    interpolate_like (nr, world->locus);
                }
                for (r = repstart; r < repstop; r++)
                    create_multiapg0 (nr->apg0[r][world->locus], nr, r,
                                      world->locus);
            }
        }
    }
}


void handle_message(char *rawmessage,int sender, world_fmt * world)
{
    char *rawptr;
    long  pos=0;
    FILE *file = stdout;
    rawptr = rawmessage;
    set_filehandle(rawmessage, world, &file, &pos);
    fprintf(file,"%s", rawmessage + pos);
    fflush(file);
}

// needs globals filedb, and filenum
void setup_filehandle_db(FILE *file, world_fmt *world, option_fmt *options, data_fmt *data)
{
    long filehandle = get_filehandle(file, world, options, data);
    filedb[filenum].file = file;
    filedb[filenum++].handle = filehandle;
  //  fprintf(stdout,"filedb %li: %p %li\n",filenum, file,filehandle);
}

long retrieve_filehandle(FILE *file)
{
    long i=0;
    long filehandle = 0;
    while(filedb[i].file != file && i<filenum)
        i++;
    if(i!=filenum)
        filehandle = filedb[i].handle;
    return filehandle;
}

long get_filehandle(FILE *file, world_fmt *world, option_fmt *options, data_fmt *data)
{
    if(file == stdout)
        return STDOUTNUM;
    if(file == options->logfile)
        return LOGFILENUM;
    if(file == world->outfile)
        return OUTFILENUM;
    if(file == options->aicfile)
        return AICFILENUM;
    if(file == world->mathfile)
        return MATHFILENUM;
#ifdef BAYESUPDATE	
    if(file == world->bayesfile)
        return BAYESFILENUM;
#endif
    return STDOUTNUM;
}

long get_filehandle2(FILE *file, world_fmt *world)
{
    if(file == stdout)
        return STDOUTNUM;
    if(file == world->options->logfile)
        return LOGFILENUM;
    if(file == world->outfile)
        return OUTFILENUM;
    if(file == world->mighistfile)
        return MIGHISTFILENUM;
    if(file == world->options->aicfile)
        return AICFILENUM;
    if(file == world->mathfile)
        return MATHFILENUM;
#ifdef BAYESUPDATE
    if(file == world->bayesfile)
        return BAYESFILENUM;
#endif
 //   fprintf(stdout,"@@@@@@@@@@@@@@@wrong wrong wrong@@@@@@@@@@@@@@@@\n");
    return STDOUTNUM;
}


void set_filehandle(char *message, world_fmt *world,
                    FILE **file, long *msgstart)
{
    long filepos=0;
    static char *temp;
    long filenum;
    temp = calloc(100,sizeof(char));
    filepos = strcspn(message,":") + 1;
    *msgstart = filepos;
    strncpy(temp,message,filepos);
    filenum = atol(temp+1);
    //fprintf(stdout,"\n@@@@@@@@@@@@@@@@@%li@%s@%li@\n",filenum,temp,filepos);
    if(filenum==STDOUTNUM)
    {
        //		fprintf(stdout,"\n");
        *file = stdout;
        return;
    }
    if(filenum == LOGFILENUM)
    {
        //	fprintf(stdout," logfile\n");
        *file = world->options->logfile;
        return;
    }
    if(filenum == OUTFILENUM)
    {
        *file = world->outfile;
        return ;
    }
    *file = stdout;
    return;
}

void
mpi_fprintf(FILE *file, const char *fmt, ...)
{
    char p1[SMALLBUFSIZE];
    char *p;
    va_list ap;
    long filehandle = 0;
    long bufsize = 0;
    p = calloc(MAXBUFSIZE,sizeof(char));
    if(myID!=MASTER)
    {
        filehandle = retrieve_filehandle(file);
        bufsize += sprintf(p, "%c%li:",'M',filehandle);
    }
    va_start(ap, fmt);
    bufsize += 1+ vsprintf(p+bufsize, fmt, ap);
    if(myID!=MASTER)
    {
        bufsize= 1+strlen(p);
        sprintf(p1,"M%li",bufsize);
        MYMPISEND (p1, SMALLBUFSIZE, MPI_CHAR, MASTER, myID, comm_world);
        MYMPISEND (p, bufsize, MPI_CHAR, MASTER, myID, comm_world);
    }
    else
        fprintf(file,"%s", p);
    va_end(ap);
   // fprintf(stderr,"%s\n", p);
    free(p);
}

#endif
