/*! \file migrate_mpi.c */
/* MPI parts for migrate
   started November 2000, Seattle
   Peter Beerli beerli@csit.fsu.edu
 
   
Copyright 1996-2002 Peter Beerli and Joseph Felsenstein, Seattle WA
Copyright 2003-2004 Peter Beerli, Tallahassee FL
 
 This software is distributed free of charge for non-commercial use
 and is copyrighted. Of course, we do not guarantee that the software
 works and are not responsible for any damage you may cause or have.
 
 
$Id: migrate_mpi.c 175 2005-12-02 14:51:23Z beerli $
*/
#ifdef MPI
#include "migration.h"
#include "tools.h"
#include "sighandler.h"
#include "migrate_mpi.h"
#include "broyden.h"
#include "combroyden.h"
#include "gammalike.h"
#include "profile.h"
#include "options.h"
#include "tree.h"
#include "world.h"
#include "joint-chains.h"
#include "data.h"
#include "laguerre.h"
#ifdef UEP
#include "uep.h"
#endif
#include "bayes.h"
/* Integer type used to cast rank and tag arguments passed to the MYMPI* calls in this file. */
#define MYINT  int
/*should go into profile.h*/
/* NOTE(review): GRIDSIZE is not referenced in the visible part of this file; per the
   remark above it presumably belongs to the profile code -- confirm before moving it. */
#define GRIDSIZE 9
/* Offset added to (locus + 1) to form the message tag of replicate traffic on comm_workers. */
#define REPTAG 90
/* NOTE(review): PRINTTAG and TEMPTAG are not referenced in the visible part of this file. */
#define PRINTTAG 800
#define TEMPTAG 7000
/* ---- functions defined in other translation units ---- */
extern void run_replicate(long locus,
                          long replicate,
                          world_fmt **universe,
                          option_fmt *options,
                          data_fmt *data, 
                          tpool_t * heating_pool,
                          int usize,
                          long *treefilepos,
                          long *Gmax);
extern void run_locus (world_fmt ** universe, int usize,
                       option_fmt * options, data_fmt * data,
                       tpool_t * heating_pool, long maxreplicate,
                       long locus, long *treefilepos, long *Gmax);

/* ---- forward declarations ----
   NOTE(review): only some of the functions below are defined in the visible part of
   this file; the others are presumably defined further down -- confirm. */

/* locus / replicate farming (used when MPIREPLICANT is defined) */
void mpi_run_locus(world_fmt ** universe, int usize, option_fmt * options,
                   data_fmt * data, tpool_t * heating_pool, long maxreplicate,
                   long locus, long *treefilepos, long *Gmax);
void mpi_runreplicates_worker (world_fmt ** universe, int usize,
                               option_fmt * options, data_fmt * data,
                               tpool_t * heating_pool,
                               long *treefilepos, long *Gmax);
/* packing / unpacking of the input data into a character buffer */
long pack_databuffer (char **buffer, data_fmt * data, option_fmt * options);
void unpack_databuffer (char *buffer, data_fmt * data, option_fmt * options);
void pack_allele_data (char **buffer, long *bufsize, data_fmt * data,
                       long pop, long ind);
void pack_sequence_data (char **buffer, long *bufsize, data_fmt * data,
                         long pop, long ind, long locus);
/* master/worker exchange of gradients and results */
void mpi_gradient_master (nr_fmt * nr, world_fmt * world, int *who);
void mpi_results_master (MYREAL sendtype, world_fmt * world,
                         long maxrep,
                         void (*unpack) (char *buffer, world_fmt * world,
                                         long locus, long maxrep,
                                         long numpop));

void mpi_results_worker (long bufs, world_fmt * world,
                         long maxrep,
                         long (*pack) (char **buffer, world_fmt * world,
                                       long locus, long maxrep, long numpop));
void assignloci_worker (world_fmt * world);
void swap_atl (long from, long to, world_fmt * world);
long pack_quantile (char **buffer, quantile_fmt quant, long n);
void unpack_quantile (char *buffer, quantile_fmt quant, long n);
long pack_failed_percentiles (char **buffer, boolean *failed, long n);
void unpack_failed_percentiles (char *buffer, boolean *failed, long n);

/* called by the master for worker messages that start with 'M' */
void handle_message(char *rawmessage,int sender, world_fmt *world);


void set_filehandle(char *message, world_fmt *world,
                    FILE **file, long *msgstart);

/* transfer of one finished replicate between replicate-worker and locus-worker */
void mpi_receive_replicate( int sender, int tag, long locus, long replicate, world_fmt * world);

/* packing / unpacking of Bayesian results */
void unpack_single_bayes_buffer(char *buffer, bayes_fmt * bayes, world_fmt * world,long locus);
long pack_single_bayes_buffer(char **buffer, bayes_fmt *bayes, world_fmt *world,long locus);
long pack_single_bayes_buffer_part(char **buffer, bayes_fmt *bayes, world_fmt *world,long locus);

/* NOTE(review): pack_single_bayes_buffer is declared a second time here; the repeated
   declaration is identical and harmless. */
long pack_single_bayes_buffer(char **buffer, bayes_fmt *bayes, world_fmt *world,long locus);
void unpack_hist_bayes_buffer(char *buffer, bayes_fmt *bayes, world_fmt *world, long locus);
long pack_hist_bayes_buffer(char **buffer, bayes_fmt *bayes, world_fmt * world, long locus);

/* packing / unpacking of sumfile records */
void unpack_sumfile_buffer (char *buffer, world_fmt * world,
                            long locus, long maxrep, long numpop);

void unpack_single_sumfile_buffer (char *input, char *buffer, timearchive_fmt **ta, world_fmt *world,
                                   long locus, long replicate, long numpop);

long
pack_sumfile_buffer (char **buffer, world_fmt * world,
                     long locus, long maxrep, long numpop);

long pack_single_sumfile_buffer(char **buffer, world_fmt * world,
                                long locus, long replicate, long numpop);

void mpi_send_replicate(int sender, long locus, long replicate, world_fmt * world);
/* stop messages (tag 0) for nodes that still wait for work on comm_workers */
long  mpi_send_stop_mcmc_lociworker(long numcpu, long loci);
long  mpi_send_stop_mcmc_replicateworker(long numcpu, long loci);

long  mpi_send_stop_mcmc_worker_orig(long numcpu, long loci, MPI_Comm *comm, MPI_Request *irequests, MPI_Status *istatus, long id);

void send_receive_bayes_params(world_fmt *world, long locus);


///
/// Master side of the locus farm: hands loci to the locus-worker nodes and collects
/// their completion reports.
///
/// \param loci   number of loci to distribute
/// \param who    on return who[locus] holds the comm_world rank that reported the locus done
/// \param world  passed through to handle_message() for 'M' messages
///
/// Protocol as implemented here:
///  - a work message consists of TWO longs, element [0] is the locus index, the tag is
///    locus+1 (never 0);
///  - a worker answers with a string: "M<size>" announces a follow-up message of <size>
///    chars that is handed to handle_message(); "R<locus>" reports a finished locus;
///  - a message with tag 0 tells a worker to stop waiting for loci.
void
mpi_runloci_master (long loci, int *who, world_fmt *world)
{
    long locus;
    int sender = 0;
    int tag;
    long locusdone = -1;
    MPI_Status status;
    long numsent = 0;
    boolean done = FALSE;
    char *tempstr;
    long tempstrsize=MAXBUFSIZE;
    long nbase = loci + 1;
    long minnodes = MIN((long) numcpu-1, (long) nbase-1);
    long *twolongs;
    long *firstbatch;   // one TWO-long work message per initial non-blocking send
    
    MPI_Request *irequests;
    MPI_Status *istatus;
    irequests = (MPI_Request *) mycalloc(minnodes,sizeof(MPI_Request));
    istatus = (MPI_Status *) mycalloc(minnodes,sizeof(MPI_Status));
    twolongs = (long *) calloc(TWO,sizeof(long));
    tempstr = (char *) mycalloc(MAXBUFSIZE,sizeof(char));
    // a buffer handed to a non-blocking send must not be modified before the send has
    // completed, therefore every initial message gets its own slot instead of reusing twolongs
    firstbatch = (long *) mycalloc((minnodes > 0 ? minnodes : 1) * TWO, sizeof(long));
    twolongs[1] = 0;
    for (locus = 0; locus < minnodes; locus++)
    {
        long *msg = firstbatch + locus * TWO;
        msg[0] = locus;
        msg[1] = 0;
        MYMPIISEND (msg, TWO, MPI_LONG, (MYINT) locus + 1, (MYINT) locus + 1, comm_world, &irequests[numsent]);
        numsent++;
    }
    MYMPIWAITALL(minnodes,irequests, istatus);
    free(firstbatch);
    
    // one completion report ('R') is expected per locus; 'M' messages are served in between
    for (locus = 0; locus < loci; locus++)
    {
        done=FALSE;
        while(!done)
        {
            MYMPIRECV (tempstr, SMALLBUFSIZE, MPI_CHAR, (MYINT) MPI_ANY_SOURCE, (MYINT) MPI_ANY_TAG, comm_world, &status);
            sender = status.MPI_SOURCE;
            tag = status.MPI_TAG;
            switch(tempstr[0])
            {
            case 'M':
                // header announces the size of the message body that follows from the same sender
                tempstrsize = atol(tempstr+1);
                MYMPIRECV (tempstr, tempstrsize, MPI_CHAR, sender, tag,
                           comm_world, &status);
                handle_message(tempstr,sender, world);
                break;
            case 'R':
                //ignore first character and translate into locusnumber
                locusdone = atol(tempstr+1);
                //
                // insert bayes receive params code here?
                //
                done=TRUE;
                break;
            default:
                fprintf(stderr,"%i> message=%s\n%i> sender=%i tag=%i\n",myID,tempstr, myID,status.MPI_SOURCE,status.MPI_TAG);
                MPI_Finalize();
                error("DIED because of wrong message from worker");
                break;
            }
        }
        who[locusdone] = sender;
        if (numsent < loci)
        {
            // more loci left: give the next one to the worker that just reported
            twolongs[0]=numsent;
            MYMPISEND (twolongs, TWO, MPI_LONG, (MYINT) sender, (MYINT) numsent + 1, comm_world);
            numsent++;
        }
        else
        {
            twolongs[0] = 0;
            MYMPISEND (twolongs, TWO, MPI_LONG, (MYINT) sender, (MYINT) 0, comm_world); //stop workers to wait for new loci
        }
    }
    //stop loci and/or replicate worker that had never the chance to work on a locus or replicate, but are still
    //listening, because I used two communicators I send a message to myID=1 that then is in charge to kill of the other nodes
    // the replicator-nodes expect a signal from comm_worker and not comm_world, the master is only member of the latter.
#ifdef MPIREPLICANT
    MYMPISEND (twolongs, TWO, MPI_LONG, FIRSTWORKER, 0, comm_world); //stop worker to wait for new loci
#endif
    free(twolongs);
    free(tempstr);   // was leaked before
    free(istatus);
    free(irequests);
}


///
/// Locus-worker loop: receives locus numbers from the master, runs each locus and
/// reports "R<locus>" back until a message with tag 0 arrives.
///
/// With MPIREPLICANT defined, nodes with myID >= data->loci+1 do not take part in the
/// locus loop and run mpi_runreplicates_worker() instead.
void
mpi_runloci_worker (world_fmt ** universe, int usize,
                    option_fmt * options, data_fmt * data,
                    tpool_t * heating_pool, long maxreplicate,
                    long *treefilepos, long *Gmax)
{
    long locus;
    long replylen = 0;
    MPI_Status status;
    long *work = (long *) calloc(TWO, sizeof(long));
    char *reply = (char *) mycalloc(STRSIZE,sizeof(char));
#ifdef MPIREPLICANT
    long nbase = data->loci+1;

    if(myID >= nbase)
    {
        // this node serves replicates, not loci
        mpi_runreplicates_worker (universe, usize, options,  data, heating_pool, treefilepos, Gmax);
        free(work);
        free(reply);
        return;
    }
#endif
    for (;;)
    {
        fflush(stdout);
        MYMPIRECV (work, TWO, MPI_LONG, MASTER, MPI_ANY_TAG,
                   comm_world, &status);
        fflush(stdout);
        locus = work[0];
        if (status.MPI_TAG == 0)
        {
            // tag 0 is the stop signal of the master
            break;
        }
#ifdef MPIREPLICANT
        mpi_run_locus(universe, usize, options, data,
                      heating_pool, maxreplicate, locus, treefilepos, Gmax);
#else
        run_locus (universe, usize, options, data,
                   heating_pool, maxreplicate, locus, treefilepos, Gmax);
#endif
        // the +1 sends the terminating NUL along with the text
        replylen = sprintf(reply,"R%li",locus) + 1;
        MYMPISEND (reply, replylen, MPI_CHAR, (MYINT) MASTER, (MYINT) (locus + ONE), comm_world);
        /* remember which loci this node worked on
           - to control the work sent by master
           - to use in setup_parameter0() [combroyden2.c] */
        universe[0]->who[locidone++] = locus;
    }
    free(work);
    free(reply);
}

#ifdef MPIREPLICANT
///
/// attempt to improve MPIruns with replicates
/// each locus-worker is responsible for farming out its replicates and for reporting back to master
/// master <- locus-master <- locus-replicate-worker
/// generate genealogies for a single locus
///
/// Work messages to replicate-workers are TWO longs {locus, replicate} sent on comm_workers
/// with tag locus+1+REPTAG; a replicate-worker answers with "R<replicate>" on the same tag,
/// after which mpi_receive_replicate() fetches the replicate data. Replicate 0 is always
/// run by this node itself.
/// \callgraph
void
mpi_run_locus(world_fmt ** universe, int usize, option_fmt * options,
          data_fmt * data, tpool_t * heating_pool, long maxreplicate,
          long locus, long *treefilepos, long *Gmax)
{
    int sender = 0;
    int tag;
    int *who;      // who[replicate] = node that computed the replicate (local bookkeeping only)
    int minnodes;  // number of replicate-worker nodes that get initial work, plus one for this node
    int nbase = data->loci; //number of loci-worker nodes
    int senderlocus = -1;
    long senderreplicate=0;
    long replicate;
    long numsent=0;
    long i;
    
    boolean done=FALSE;
    
    char *tempstr;
    long *temp;
    long *firstwork = NULL;   // one {locus, replicate} message per initial non-blocking send
    
    MPI_Request *irequests = NULL;
    MPI_Request irequest;
    MPI_Status *istatus=NULL;
    MPI_Status status;
    
    temp = (long *) calloc(TWO, sizeof(long));
    tempstr = (char *) mycalloc(MAXBUFSIZE,sizeof(char));
    who = (int *) mycalloc(maxreplicate,sizeof(int));

    
    if(maxreplicate>1)
    {
        // number of nodes available for replicate workers,
        // numcpu - nbase - 1 [for masternode] and maxreplicate-1 [locus-worker is doing one replicate itself]  are limiting
        minnodes =  MAX(0,MIN(maxreplicate-1,numcpu-nbase-1))+1;
        irequests = (MPI_Request *) mycalloc(minnodes+1,sizeof(MPI_Request));
        istatus   = (MPI_Status *) mycalloc(minnodes+1,sizeof(MPI_Status));
        // a buffer given to a non-blocking send must stay untouched until the send completed,
        // so each initial request gets its own message slot instead of sharing temp
        firstwork = (long *) mycalloc(TWO * (minnodes+1), sizeof(long));
        temp[0] = locus;
        // replicate 1 to maxreplicate should be worked on by other nodes
        // minnodes is the number of nodes free for work on replicates alone
        // so with a 3 replicates to work on and 3 total nodes and a single locus we need
        // 1 master, 1 locus-worker and have 1 replicate-worker, the locus-worker and the replicate worker will
        // both work on replicates, the locus-worker sends off 1 request (for the replicate worker)
        // because we start the locus worker with replicate 0, the loop for the replicate workers starts at one
        for (replicate = 1; replicate < minnodes; replicate++)
        {
            long *msg = firstwork + TWO * numsent;
            msg[0] = locus;
            msg[1] = replicate;
            MYMPIISEND (msg, TWO, MPI_LONG, (MYINT) (replicate + nbase-1), (MYINT) (locus+1 + REPTAG), comm_workers, &irequests[numsent]);
            numsent++;   // counter of how many replicates are sent off-node
        }
        // the sends are non-blocking so that this node can work on replicate 0 in the meantime
        run_replicate(locus, 0, universe, options, data, 
                      heating_pool, usize,
                      treefilepos, Gmax);
        who[0] = myID;
        MYMPIWAITALL(numsent,irequests, istatus); // wait until all initial work messages are delivered
        replicate=1;   // replicate counter set to 1 because locus-worker itself finished first replicate
        numsent++;     // from here on numsent also counts replicate 0, i.e. it is the next replicate to hand out
        
        done = FALSE;
        while(!done)
        {
            // no replicator worker available
            // the loci-worker has to do all the work
            if(numsent==1)
            {
                run_replicate(locus, replicate, universe, options, data, 
                              heating_pool, usize,
                              treefilepos, Gmax);
                who[replicate] = myID;
                replicate++;
                if(replicate >= maxreplicate)
                    done=TRUE;
            }            
            else
            {
                MYMPIRECV (tempstr, SMALLBUFSIZE, MPI_CHAR, MPI_ANY_SOURCE, (MYINT)(locus+1+ REPTAG), comm_workers, &status);
                sender = status.MPI_SOURCE;  // repID of the replicator that did the work
                tag = status.MPI_TAG;        // tag is working locus + replicator tag 
                senderlocus = tag-REPTAG;    // locus that was worked on by sender  
                // test so that we can be sure that we got things from a valid replicator worker
                if(sender == myRepID)
                {
                    fprintf(stdout,"%i, %i> DIE\nDIE\nDIE\nDIE\nDIE\nDIE\nDIE\nDIE\nDIE\n",myID, myRepID);
                    error("tried to send a replicate to myself using MPI -- this is not allowed\n");
                }
                // test whether the locus is the same between locus-worker and replicator
                // (locus is a long, the format expects int, hence the cast)
                if(senderlocus-1 != locus)
                    warning("%i> !!!!!!!!!!!!! got wrong locus from worker myRepID=%i (my locus %i != its locus %i )\n",
                          myID, sender, (int) locus, senderlocus-1);
                // receive only messages that are prefixed with 'R' and exit on all others
                if(tempstr[0]=='R')
                {
                    //ignore first character and translate into repnumber
                    senderreplicate = atol(tempstr+1);
                }
                else
                {
                    fprintf(stderr,"%i> message=%s\n%i> sender=%i tag=%i\n",myID,tempstr, myID,status.MPI_SOURCE,status.MPI_TAG);
                    error("DIED because of wrong message from worker");
                }
                // the replicate number comes from a message, make sure it can index who[]
                if(senderreplicate < 0 || senderreplicate >= maxreplicate)
                {
                    fprintf(stderr,"%i> replicate %li reported by %i is out of range\n",myID, senderreplicate, sender);
                    error("DIED because of wrong message from worker");
                }
                who[senderreplicate] = sender; // record sender
                mpi_receive_replicate(sender, tag, locus, senderreplicate, universe[0]); // receive replicate data
                replicate++;   
                if(replicate >= maxreplicate)
                {
                    done=TRUE;
                }
                temp[0] = locus;
                if (numsent < maxreplicate) //at least one set was worked by the locus-worker
                {
                    // hand the next open replicate to the worker that just reported
                    temp[1] = numsent;
                    MYMPIISEND (temp, TWO, MPI_LONG, (MYINT) sender, (MYINT) (locus+1+ REPTAG), comm_workers, &irequest);
                    numsent++;
                    MYMPIWAITALL(ONE, &irequest, &status);
                }
            }
        }
    }
    else
    { // no replicates
        run_replicate(locus, 0, universe, options, data, 
                      heating_pool, usize,
                      treefilepos, Gmax);
    }    
    free(who);
    free(temp);
    free(tempstr);
    free(firstwork);
    free(irequests);
    free(istatus);
#ifdef UEP
    if (options->uep)
        show_uep_store(universe[0]);
#endif
    if (options->replicate && options->replicatenum > 0)
    {
        (universe[0])->repkind = MULTIPLERUN;
        if (options->bayes_infer)
        {
            calculate_credibility_interval(universe[0], locus);
            // NOTE(review): this early return skips the tree cleanup and bayes_reset() below,
            // unlike the non-replicate Bayesian path -- confirm that this is intended.
            return;
        }
#ifdef LONGSUM
        change_longsum_times(EARTH);
#endif /*LONGSUM*/        
        // over multiple replicates if present or single locus
        (void) estimateParameter(options->replicatenum, *Gmax, universe[0], options,
                                 (universe[0])->cov[locus], options->lchains, /*type*/ 'l',
                                 SINGLELOCUS, (universe[0])->repkind);
    }
    else
    {
        if (options->bayes_infer)
        {
            calculate_credibility_interval(universe[0], locus);
        }
    }        
    //=======
    if (options->heating)
    {
        for (i = 0; i < options->heated_chains; i++)
        {
            free_tree(universe[i]->root, universe[i]);
        }
    }
    else
    {
        free_tree(universe[0]->root, universe[0]);        
    }
    //=======
    bayes_reset(universe[0]);
    
}


///
/// run replicates on replicate-worker nodes
/// the function waits on comm_workers for messages of two longs {locus, replicate} and
/// will work on any locus and any replicate; the sending node is responsible for the
/// correct assignment of locus and replicate.
/// For every request the replicate is run, "R<replicate> " is sent back to the requesting
/// node with tag locus+1+REPTAG and the replicate data follows through mpi_send_replicate().
/// A message with tag 0 ends the loop.
void
mpi_runreplicates_worker (world_fmt ** universe, int usize,
                    option_fmt * options, data_fmt * data,
                    tpool_t * heating_pool, 
                    long *treefilepos, long *Gmax)
{
    boolean done = FALSE;
    int sender;
    long replicate;
    long locus;
    char *rawmessage;
    long rawmsgsize = 0;
    
    MPI_Status status;
    long *temp;
    temp = (long *) calloc(TWO, sizeof(long));
    rawmessage = (char *) mycalloc(STRSIZE,sizeof(char));
    while (!done)
      {
        MYMPIRECV (temp, 2, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
                   comm_workers, &status);
        sender = status.MPI_SOURCE;
        locus = temp[0];
        replicate = temp[1];
        if (status.MPI_TAG != 0) //stop condition
          {
            run_replicate(locus, replicate, universe, options, data, heating_pool, usize,treefilepos, Gmax);
            // no terminating NUL is sent; the trailing blank ends the number for the
            // atol() call in mpi_run_locus()
            rawmsgsize = sprintf(rawmessage,"R%li ",replicate);
        
            MYMPISEND (rawmessage, rawmsgsize, MPI_CHAR, (MYINT) sender, (MYINT) (locus+1+ REPTAG), comm_workers);
            mpi_send_replicate(sender, locus, replicate, universe[0]);
          }
        else
          {
            done = TRUE;
          }
      }
    free(temp);
    free(rawmessage);   // was leaked before
}

//---------end replication in MPI
#endif /*MPIREPLICANT*/

///
/// Master side of a likelihood evaluation.
/// Sends {MIGMPI_LIKE, param[], lparam[]} to the workers 1 .. MIN(loci+addon, numcpu)-1,
/// receives one vector with world->loci entries from each of them, adds the vectors up
/// in nr->locilikes and stores the sum over all loci in nr->llike.
/// \returns the sum over all loci (same value as nr->llike)
MYREAL
mpi_likelihood_master (MYREAL *param, MYREAL *lparam,
                       world_fmt * world, nr_fmt * nr,
                       helper_fmt * helper, int *who)
{
    MPI_Status status;
    MYREAL *request;
    MYREAL *answer;
    MYREAL total = 0.0;
    long k;
    long node;
    // parameters per vector; the gamma case carries one extra element
    const long nparam = world->numpop2 + (world->options->gamma ? 1 : 0);
    // message code + param + lparam + one spare element
    const long msglen = nparam * 2 + 2;
    // with a single locus only one worker is addressed
    const long addon = (world->loci == 1) ? 0 : 1;
    const long nodelimit = MIN (world->loci + addon, numcpu);

    doublevec1d(&answer,world->loci);
    doublevec1d(&request,msglen);

    request[0] = MIGMPI_LIKE;
    memcpy (request + 1, param, nparam * sizeof (MYREAL));
    memcpy (request + 1 + nparam, lparam, nparam * sizeof (MYREAL));

    for (k = 0; k < world->loci; k++)
    {
        nr->locilikes[k] = 0.;
    }

    node = 1;
    while (node < nodelimit)
    {
        MYMPISEND (request, (MYINT) msglen, MPI_DOUBLE, (MYINT) node, (MYINT) node, comm_world);
        node++;
    }
    node = 1;
    while (node < nodelimit)
    {
        MYMPIRECV (answer, (MYINT) world->loci, MPI_DOUBLE, MPI_ANY_SOURCE,
                   MPI_ANY_TAG, comm_world, &status);
        // a worker reports a vector over all loci, e.g. (0,0,0,-12,0,-34,0,0,0);
        // loci it did not evaluate are zero, so adding the vectors merges the results
        for (k = 0; k < world->loci; k++)
        {
            nr->locilikes[k] += answer[k];
        }
        node++;
    }
    for (k = 0; k < world->loci; k++)
    {
        total += nr->locilikes[k];
    }
    nr->llike = total;
    free (request);
    free (answer);
    return total;
}


///
/// Worker side of a likelihood evaluation: fills nr->locilikes for the loci this node
/// worked on (the first locidone entries of nr->world->who); all other entries stay zero.
/// Parameters are taken from helper->expxv and helper->xv.
void
mpi_likelihood_worker (world_fmt * world, helper_fmt * helper, long rep)
{
    nr_fmt *nr = helper->nr;
    MYREAL *expvalues = helper->expxv;
    MYREAL *logvalues = helper->xv;
    MYREAL *mu_rates = world->options->mu_rates;
    long idx;
    long locus;

    memset (nr->locilikes, 0, sizeof (MYREAL) * world->loci);
    if (world->options->gamma)
    {
        // the element after the numpop2 parameters is capped before it is exponentiated
        MYREAL *logalpha = logvalues + nr->numpop2;
        if (*logalpha > 9.903487553)
        {
            *logalpha = 9.903487553;
        }
        initgammacat (nr->categs, EXP (*logalpha),1./* EXP (lparam[0])*/,
                      nr->rate, nr->probcat);
    }

    for (idx = 0; idx < locidone; idx++)
    {
        locus = nr->world->who[idx];
        if (world->options->gamma)
        {
            helper->locus = locus;
            nr->locilikes[locus] = gamma_locus_like (nr,expvalues,logvalues,helper->weight,locus);
        }
        else
        {
            nr->locilikes[locus] =
                calc_locus_like (nr, expvalues, logvalues, locus) + mu_rates[locus];
        }
    }
}

///
/// Master side of the start-parameter exchange.
/// Each incoming message holds numpop2+1 values: element 0 is the number of loci the
/// sender covers, elements 1..numpop2 are added to world->param0. Messages are collected
/// until the reported loci add up to world->loci, then world->param0 is divided by world->loci.
void
mpi_startparam_master(world_fmt * world)
{
    MPI_Status status;
    MYREAL *packet;
    long p;
    int locicovered = 0;

    packet = (MYREAL*) mycalloc(world->numpop2+1,sizeof(MYREAL));
    while (locicovered < world->loci)
    {
        MYMPIRECV (packet, world->numpop2+1, MPI_DOUBLE, MPI_ANY_SOURCE,
                   MPI_ANY_TAG, comm_world, &status);
        for (p = 0; p < world->numpop2; p++)
        {
            world->param0[p] += packet[p+1];
        }
        // the locus count travels as a floating point value
        locicovered += (long) packet[0];
    }
    for (p = 0; p < world->numpop2; p++)
    {
        world->param0[p] /= world->loci;
    }

    free(packet);
}

///
/// Worker side of the start-parameter exchange.
/// A node that worked on at least one locus sends numpop2+1 values to the master:
/// element 0 is locidone, elements 1..numpop2 are the parameters of world->atl
/// averaged over the node's loci and the replicate range given by set_replicates().
/// Nodes with locidone == 0 send nothing.
void
mpi_startparam_worker (world_fmt * world)
{
    long idx;
    long first;
    long last;
    long rep;
    long p;
    long locus;
    MYREAL *packet;

    if (locidone <= 0)
    {
        return;
    }
    packet = (MYREAL*) mycalloc(world->numpop2+1,sizeof(MYREAL));
    set_replicates (world, world->repkind, world->options->replicatenum,
                    &first, &last);
    packet[0]=(MYREAL)locidone;
    for (idx = 0; idx < locidone; idx++)
    {
        locus = world->who[idx];
        for (rep = first; rep < last; rep++)
        {
            for (p = 0; p < world->numpop2; p++)
            {
                packet[p+1] += world->atl[rep][locus].param[p];
            }
        }
    }
    // turn the sums into means over loci and replicates
    for (p = 1; p < world->numpop2+1; p++)
    {
        packet[p] /= locidone * (last-first);
    }
    MYMPISEND (packet, world->numpop2+1, MPI_DOUBLE, MASTER, myID, comm_world);
    free(packet);
}


///
/// Master side of the Gmax exchange: broadcasts 0, then receives one value from each of
/// the numcpu-1 other nodes and leaves the largest of them in *Gmax.
void
mpi_gmax_master (world_fmt * world, long *Gmax)
{
    MPI_Status status;
    long reported;
    int answers;

    *Gmax = 0;
    MYMPIBCAST (Gmax, ONE, MPI_LONG, MASTER, comm_world);
    for (answers = 0; answers < numcpu - 1; answers++)
    {
        MYMPIRECV (&reported, ONE, MPI_LONG, MPI_ANY_SOURCE,
                   MPI_ANY_TAG, comm_world, &status);
        if (reported > *Gmax)
        {
            *Gmax = reported;
        }
    }
    //  do we need this barrier really?
    MYMPIBARRIER(comm_world);
}

///
/// Worker side of the Gmax exchange: takes the start value from the master's broadcast,
/// raises it to the largest world->atl[r][locus].T among the loci this node worked on
/// (replicate range from set_replicates()) and sends the result to the master.
void
mpi_gmax_worker (world_fmt * world)
{
    long idx;
    long first;
    long last;
    long rep;
    long locus;
    long largest = 1;

    // the broadcast overwrites the initial value with the one sent by the master
    MYMPIBCAST (&largest, ONE, MPI_LONG, MASTER, comm_world);
    set_replicates (world, world->repkind, world->options->replicatenum,
                    &first, &last);

    for (idx = 0; idx < locidone; idx++)
    {
        locus = world->who[idx];
        rep = first;
        while (rep < last)
        {
            if (world->atl[rep][locus].T > largest)
            {
                largest = world->atl[rep][locus].T;
            }
            rep++;
        }
    }
    MYMPISEND (&largest, ONE, MPI_LONG, MASTER, myID, comm_world);
    //  do we need this barrier really?
    MYMPIBARRIER(comm_world);
}

///
/// Sends a stop message (two zero longs, tag 0) over *comm to the ranks
/// loci+1-xx .. numcpu-xx-1, where xx is 0 when id is 0 and 1 otherwise, and waits
/// for the sends to complete.
/// The caller provides irequests and istatus; they need one slot per message.
/// The ranks are those of the communicator passed in, do not mix them up with the
/// ranks of comm_world that include the master (id=0 there).
/// \returns the number of stop messages sent
long  mpi_send_stop_mcmc_worker_orig(long numcpu, long loci, MPI_Comm *comm, MPI_Request *irequests, MPI_Status *istatus, long id)
{
    const long shift = (id==0) ? 0 : 1;
    const long firstrank = loci+1-shift;
    const long endrank = numcpu-shift;
    long posted = 0;
    long rank;
    // calloc leaves both longs at zero; the content is the same for every receiver
    long *stopmsg = (long *) calloc(TWO, sizeof(long));

    rank = firstrank;
    while (rank < endrank)
    {
        MYMPIISEND (stopmsg, TWO, MPI_LONG, (MYINT) rank, 0, *comm, &irequests[posted]);
        posted++;
        rank++;
    }
    if (posted > 0)
        MYMPIWAITALL(posted,irequests, istatus); // wait until all stop messages are delivered
    free(stopmsg);
    return posted;
}

///
/// Sends a stop message (two zero longs, tag 0) on comm_workers to the ranks
/// loci+1-xx .. numcpu-xx-1, where xx is 0 when myID is 0 and 1 otherwise, and waits
/// for the sends to complete.
/// The ranks are those of comm_workers, do not mix them up with the ranks of
/// comm_world that include the master (id=0 there).
/// \param numcpu total number of nodes
/// \param loci   number of loci
/// \returns the number of stop messages sent (0 when there is no node to stop)
long  mpi_send_stop_mcmc_lociworker(long numcpu, long loci)
{
    long sent = 0;
    long receiver;
    long xx = (myID==0) ? 0 : 1;
    // the loop below posts exactly numcpu-loci-1 sends, independent of xx; the request
    // and status arrays must have that many slots (they were sized MIN(numcpu,loci-1)
    // before, which is too small e.g. for loci=1 and numcpu=4)
    long nstop = numcpu - loci - 1;
    long *temp;

    MPI_Request *irequests;
    MPI_Status *istatus;

    if (nstop <= 0)
        return 0;   // nobody is waiting for a stop message

    irequests = (MPI_Request *) mycalloc(nstop,sizeof(MPI_Request));
    istatus = (MPI_Status *) mycalloc(nstop,sizeof(MPI_Status));

    temp = (long *) calloc(TWO, sizeof(long));

    for(receiver=loci+1-xx; receiver< numcpu-xx; receiver++)
    {
      MYMPIISEND (temp, TWO, MPI_LONG, (MYINT) receiver, 0, comm_workers, &irequests[sent]);
        sent++;
    }
    // sent equals nstop here and is therefore positive
    MYMPIWAITALL(sent,irequests, istatus); // wait until all stop messages are delivered
    free(temp);
    free(irequests);
    free(istatus);
    return sent;
}
long  mpi_send_stop_mcmc_replicateworker(long numcpu, long loci)
{
    long sent = 0;
    long receiver;
    long xx = (myID==0) ? 0 : 1;
    long minnodes = labs(numcpu - loci -1);
    long *temp;

    MPI_Request *irequests;
    MPI_Status *istatus;
    irequests = (MPI_Request *) mycalloc(minnodes,sizeof(MPI_Request));
    istatus = (MPI_Status *) mycalloc(minnodes,sizeof(MPI_Status));

    temp = (long *) calloc(TWO, sizeof(long));

    for(receiver=loci+1-xx; receiver< numcpu-xx; receiver++)
    {
      //fprintf(stdout,"%i> About to stop node %li\n",myID, receiver+xx); fflush(stdout);
      MYMPIISEND (temp, 2, MPI_LONG, (MYINT) receiver, 0, comm_workers, &irequests[sent]);
        sent++;
    }
    if(sent>0)
      MYMPIWAITALL(sent,irequests, istatus); // wait for all replicators to finish
    free(temp);
    free(irequests);
    free(istatus);
    return sent;
}

///
/// Sends a message whose first element is MIGMPI_END, with tag 0, to every node
/// 1 .. numcpu-1 on comm_world. The message has the same length (2*numelem+2) as the
/// one built in mpi_likelihood_master().
void
mpi_send_stop (world_fmt * world)
{
    const long nparam = world->numpop2 + (world->options->gamma ? 1 : 0);
    const long msglen = 2 * nparam + 2;
    MYREAL *stopmsg = (MYREAL *) mycalloc (msglen, sizeof (MYREAL));
    long worker = 1;

    stopmsg[0] = MIGMPI_END;
    while (worker < numcpu)
    {
        MYMPISEND (stopmsg, (MYINT) msglen, MPI_DOUBLE, (MYINT) worker, (MYINT) 0, comm_world); //end of loci
#ifdef DEBUG_MPI
        FPRINTF(stdout,"%i> sent stop to %i\n",myID, worker);
#endif
        worker++;
    }
    free (stopmsg);
}

/// Sends the end-of-work message to a single worker.
/// The message is a zero-filled vector of 2*(numpop2 [+1 with gamma])+2 MYREALs
/// with MIGMPI_END in the first element, sent with tag 0 on comm_world.
/// \param worker rank of the receiver
/// \param world  supplies numpop2 and options->gamma, which determine the message length
void
mpi_send_stop_tag (int worker, world_fmt * world)
{
    const long has_gamma = world->options->gamma ? 1 : 0;
    const long msglen = 2 * (world->numpop2 + has_gamma) + 2;
    MYREAL *msg = (MYREAL *) mycalloc (msglen, sizeof (MYREAL));

    msg[0] = MIGMPI_END;
    MYMPISEND (msg, (MYINT) msglen, MPI_DOUBLE, (MYINT) worker, (MYINT) 0, comm_world);
    free (msg);
}

void
mpi_results_stop (void)
{
    long worker;
    long dummy = 0;
    for (worker = 1; worker < numcpu; worker++)
    {
        MYMPISEND (&dummy, ONE, MPI_LONG, (MYINT) worker, (MYINT) 0, comm_world);
    }
}

/// Master side of the distributed gradient calculation.
/// Sends the current parameters to the workers and accumulates the partial
/// gradients they return in nr->d.
///
/// Message layout (numelem = nr->partsize):
///   temp[0]                       MIGMPI_GRADIENT
///   temp[1 .. numelem]            nr->param
///   temp[numelem+1 .. 2*numelem]  nr->lparam
///   temp[2*numelem+1]             nr->profilenum
/// followed by a second message that holds a copy of nr->indeks.
///
/// \param nr    supplies param, lparam, indeks, profilenum; nr->d receives the sum
/// \param world world->loci limits how many workers are addressed
/// \param who   not used by this function
void
mpi_gradient_master (nr_fmt * nr, world_fmt * world, int *who)
{
    long locus;
    long addon;
    long upper;
    MPI_Status status;

    MYREAL *temp;
    long *tempindex;
    long numelem = nr->partsize;
    long numelem2 = 2 * numelem;
    temp = (MYREAL *) mycalloc (numelem2+2, sizeof (MYREAL));
    tempindex = (long *) mycalloc (numelem, sizeof (long));
    temp[0] = MIGMPI_GRADIENT;
    memcpy (temp + 1, nr->param, numelem * sizeof (MYREAL));
    memcpy (tempindex, nr->indeks, nr->partsize * sizeof (long));
    memcpy (temp + 1 + numelem, nr->lparam, numelem * sizeof (MYREAL));
    temp[numelem2+1] = nr->profilenum;
    addon = (world->loci == 1)? 0 : 1;
    // ranks 1 .. upper-1 take part; rank and message tag are identical
    upper = MIN (world->loci + addon, numcpu);
    for (locus = 1; locus < upper; locus++)
    {
        MYMPISEND (temp, (MYINT) numelem2+2, MPI_DOUBLE, (MYINT) locus, (MYINT) locus, comm_world);
        MYMPISEND (tempindex, (MYINT) numelem, MPI_LONG, (MYINT) locus, (MYINT) locus, comm_world);
    }
    memset (nr->d, 0, sizeof (MYREAL) * numelem);
    for (locus = 1; locus < upper; locus++)
    {
        // one answer per addressed worker, accepted in whatever order they arrive;
        // the receive overwrites nr->d, hence the save/add-back calls around it
        copy_and_clear_d (nr);
        MYMPIRECV (nr->d, (MYINT) numelem, MPI_DOUBLE, MPI_ANY_SOURCE,
                  MPI_ANY_TAG, comm_world, &status);
        add_back_d (nr);
    }
    free (temp);
    // the index copy was never released before
    free (tempindex);
}

/// Worker side of the gradient calculation: clears nr->d and then processes the
/// first locidone entries of nr->world->who[], one locus at a time.
/// With options->gamma set the work is delegated to gamma_locus_derivative(),
/// otherwise simple_loci_derivatives() is called between copy_and_clear_d()
/// and add_back_d().
/// \param helper passed on to gamma_locus_derivative()
/// \param nr     derivative work structure, nr->d is reset on entry
/// \param tyme   passed on to simple_loci_derivatives()
void
mpi_gradient_worker (helper_fmt * helper, nr_fmt * nr,
                     timearchive_fmt ** tyme)
{
    long k;
    long current;

    memset (nr->d, 0, sizeof (MYREAL) * nr->partsize);
    for (k = 0; k < locidone; k++)
    {
        current = nr->world->who[k];
        if(nr->world->options->gamma)
        {
            gamma_locus_derivative (helper, current);
            continue;
        }
        copy_and_clear_d (nr);
        simple_loci_derivatives (nr->d, nr, tyme, current);
        add_back_d (nr);
    }
}

/// Worker loop used while the master maximizes the likelihood.
/// After receiving Gmax by broadcast and setting up an nr_fmt, the worker waits
/// for messages from MASTER and dispatches on the command stored in the first
/// element of the message until MIGMPI_END arrives.
/// The message tag minus one is taken as the locus.
/// \param world the worker's world structure
/// \param kind  passed on to SETUPPARAM0
/// \param rep   replicate, used for set_replicates() and the likelihood calculation
void
mpi_maximize_worker (world_fmt * world, long kind, long rep)
{
    boolean done = FALSE;
    long locus;
    MPI_Status status;
    nr_fmt *nr;
    helper_fmt helper;
    long repstart, repstop, Gmax;
    long numelem =  world->numpop2 + (world->options->gamma ?  1 : 0) ;
    long numelem2 = numelem * 2;
    MYREAL *temp;
    // receive buffer: command, two parameter vectors of numelem values, one extra value
    temp = (MYREAL *) mycalloc (numelem2 + 2, sizeof (MYREAL));
    helper.xv = (MYREAL *) mycalloc (numelem2, sizeof (MYREAL));
    helper.expxv = (MYREAL *) mycalloc (numelem2, sizeof (MYREAL));
    nr = (nr_fmt *) mycalloc (1, sizeof (nr_fmt));
    set_replicates (world, world->repkind, rep, &repstart, &repstop);

    which_calc_like (world->repkind);
    MYMPIBCAST (&Gmax, 1, MPI_LONG, MASTER, comm_world);
    create_nr (nr, world, Gmax, 0, world->loci, world->repkind, repstart);
    SETUPPARAM0 (world, nr, world->repkind,
                 repstart, repstop, world->loci, kind, TRUE);
    while (!done)
    {
        MYMPIRECV (temp, (MYINT) numelem2+2, MPI_DOUBLE, MASTER, MPI_ANY_TAG,
                  comm_world, &status);
        locus = world->locus = status.MPI_TAG - 1;
        switch ((long) temp[0])
        {
        case MIGMPI_LIKE:
            // parameters arrive as two vectors of numelem values, the answer is
            // the vector of per-locus likelihoods
            memset (nr->locilikes, 0, sizeof (MYREAL) * (world->loci+1));
            memcpy (helper.expxv, temp + 1, sizeof (MYREAL) * numelem);
            memcpy (helper.xv, temp + 1 + numelem,
                    sizeof (MYREAL) * numelem);
            fill_helper (&helper, helper.expxv, helper.xv, world, nr);
            mpi_likelihood_worker (world, &helper, rep);
            MYMPISEND (nr->locilikes, (MYINT) world->loci, MPI_DOUBLE, (MYINT) MASTER,
                      (MYINT) (locus + 1), comm_world);
            break;
        case MIGMPI_GRADIENT:
            // NOTE(review): only numelem-1 values are copied into nr->param while
            // numelem values are copied into nr->lparam -- confirm this is intended
            memcpy (nr->param, temp + 1, sizeof (MYREAL) * (numelem - 1));
            memcpy (nr->lparam, temp + 1 + numelem,
                    sizeof (MYREAL) * numelem);
            fill_helper (&helper, nr->param, nr->lparam, world, nr);
            nr->profilenum = temp[numelem2 + 1];
            // the index vector follows in a second message with the same tag
            MYMPIRECV (nr->indeks, (MYINT) numelem, MPI_LONG, MASTER, (MYINT) (locus+1), // was ????????? MPI_ANY_TAG,
                      comm_world, &status);
            mpi_gradient_worker (&helper, nr, world->atl);
            MYMPISEND (nr->d, (MYINT) numelem, MPI_DOUBLE, (MYINT) MASTER, (MYINT) (locus + 1),
                      comm_world);
            break;
        case MIGMPI_RESULT:
            mpi_results_worker (temp[0], world, repstop, pack_result_buffer);
            break;
        case MIGMPI_SUMFILE:
            mpi_results_worker (temp[0], world, repstop, pack_sumfile_buffer);
            break;
        case MIGMPI_MIGHIST:
            mpi_results_worker (temp[0], world, repstop, pack_mighist_buffer);
            break;
        case MIGMPI_BAYESHIST:
            mpi_results_worker (temp[0], world, repstop, pack_bayes_buffer);
            break;
        case MIGMPI_END:
            done = TRUE;
            break;
        default:
            fprintf (stdout, "%i> does not understand task\n", myID);
            exit (0);
        }
    }
    // the receive buffer was never released before
    free(temp);
    free(helper.xv);
    free(helper.expxv);
    // NOTE(review): nr itself is allocated in this function; confirm that
    // destroy_nr() also releases the structure and not only its members
    destroy_nr (nr, world);
}

/// Master side of the options broadcast: serializes the options with
/// save_options_buffer() and broadcasts first the size and then the content of
/// the resulting buffer on comm_world.
/// \param options options to distribute
void
broadcast_options_master (option_fmt * options)
{
    // start with a minimal buffer, save_options_buffer() receives its address
    char *packed = (char *) mycalloc (1, sizeof (char));
    long nbytes = save_options_buffer (&packed, options);

    MYMPIBCAST (&nbytes, 1, MPI_LONG, MASTER, comm_world);
    MYMPIBCAST (packed, nbytes, MPI_CHAR, MASTER, comm_world);
    free (packed);
}


/// Worker side of the options broadcast: receives the size and then the content
/// of the serialized options and hands the text to read_options_worker().
/// \param options structure that read_options_worker() fills
void
broadcast_options_worker (option_fmt * options)
{
    long nbytes;
    char *base;
    char *cursor;

    MYMPIBCAST (&nbytes, 1, MPI_LONG, MASTER, comm_world);
    base = (char *) mycalloc (nbytes + 1, sizeof (char));
    MYMPIBCAST (base, nbytes, MPI_CHAR, MASTER, comm_world);
    // read_options_worker() receives the address of the pointer, so a separate
    // cursor is handed over and the allocation is released through base
    cursor = base;
    read_options_worker (&cursor, options);
    free (base);
}


/// Master side of the data broadcast: serializes the data with pack_databuffer()
/// and broadcasts first the string length (including the terminating zero) and
/// then the text itself on comm_world.
/// \param data    data set to distribute
/// \param options passed on to pack_databuffer()
void
broadcast_data_master (data_fmt * data, option_fmt * options)
{
    char *packed = (char *) mycalloc (1, sizeof (char));
    long nbytes;

    // the size returned by pack_databuffer() is not used, the length that is
    // transmitted is derived from the string itself
    (void) pack_databuffer (&packed, data, options);
    nbytes = (long) strlen(packed)+1;
    MYMPIBCAST (&nbytes, 1, MPI_LONG, MASTER, comm_world);
    MYMPIBCAST (packed, nbytes, MPI_CHAR, MASTER, comm_world);
    free (packed);
}

/// Worker side of the data broadcast: receives the size and then the content of
/// the serialized data set and rebuilds it with unpack_databuffer().
/// \param data    structure that unpack_databuffer() fills
/// \param options passed on to unpack_databuffer()
void
broadcast_data_worker (data_fmt * data, option_fmt * options)
{
    long nbytes;
    char *packed;

    MYMPIBCAST (&nbytes, 1, MPI_LONG, MASTER, comm_world);
    packed = (char *) mycalloc (nbytes, sizeof (char));
    MYMPIBCAST (packed, nbytes, MPI_CHAR, MASTER, comm_world);
    unpack_databuffer (packed, data, options);
    free (packed);
}

/// Serializes the data set into a text buffer, one record per line, in the order
/// that unpack_databuffer() reads:
///   datatype, hasghost, numpop, loci
///   sites per locus, then addon and fracchange
///   per population: name, (numind, numalleles) per locus, then the individuals
///     (name followed by allele pairs for non-sequence data, or by the sequence)
///   geo/lgeo pairs when options->geo is set
///   UEP states when compiled with UEP and options->uep is set
/// Every record is appended with strcat(), therefore *buffer must hold a valid
/// C string on entry.
/// \param buffer  address of the (re)allocated output buffer
/// \param data    data set to serialize
/// \param options supplies datatype, nmlength, geo and uep
/// \return the accumulated allocation size of *buffer
long
pack_databuffer (char **buffer, data_fmt * data, option_fmt * options)
{
    long locus, pop, ind;
    long bufsize = 0;
    long biggest;
#ifdef UEP

    long sumtips, i;
#endif

    char fp[LONGLINESIZE];
    bufsize += LONGLINESIZE;
    *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
    // each record adds its formatted length plus one to the allocation
    bufsize += 1 + sprintf (fp, "%c %li %li %li\n", options->datatype, (long) data->hasghost,
                            data->numpop, data->loci);
    *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
    strcat (*buffer, fp);
    for (locus = 0; locus < data->loci; locus++)
    {
        bufsize += 1 + sprintf (fp, "%li\n", data->seq->sites[locus]);
        *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
    }
    bufsize += 1 + sprintf (fp, "%li %f\n", data->seq->addon, data->seq->fracchange);
    *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
    strcat (*buffer, fp);
    // population data
    for (pop = 0; pop < data->numpop; pop++)
    {
        bufsize += 1 + sprintf (fp, "%s\n", data->popnames[pop]);
        *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
        biggest = 0;
        for(locus=0; locus<data->loci; locus++)
        {
            bufsize += 1 + sprintf (fp, "%li %li\n", data->numind[pop][locus],data->numalleles[pop][locus]);
            if(biggest < data->numind[pop][locus])
                biggest = data->numind[pop][locus];
            *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
            strcat (*buffer, fp);
        }
        if (!strchr (SEQUENCETYPES, options->datatype))
        {
            // non-sequence data: the largest sample size over all loci gives the
            // number of individuals, each followed by its alleles for all loci
            for (ind = 0; ind < biggest; ind++)
            {
                bufsize += 1 + sprintf (fp, "%*.*s\n", (int) options->nmlength,
                                        (int) options->nmlength, data->indnames[pop][ind]);
                *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
                strcat (*buffer, fp);
                pack_allele_data (buffer, &bufsize, data, pop, ind);
            }
        }
        else
        {
            // sequence data: name and sequence are written per locus and individual
            for(locus=0;locus<data->loci; ++locus)
            {
                for (ind = 0; ind < data->numind[pop][locus]; ind++)
                {
                    bufsize += 1 + sprintf (fp, "%*.*s\n", (int) options->nmlength, (int) options->nmlength,
                                            data->indnames[pop][ind]);
                    *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
                    strcat (*buffer, fp);
                    pack_sequence_data (buffer, &bufsize, data, pop, ind, locus);
                }
            }
        }
    }
    // geofile
    if (options->geo)
    {
        for (pop = 0; pop < data->numpop * data->numpop; pop++)
        {
            bufsize += 1 + sprintf (fp, "%f %f\n", data->geo[pop], data->lgeo[pop]);
            *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
            strcat (*buffer, fp);
        }
    }
    // uepfile
#ifdef UEP
    if (options->uep)
    {
        sumtips = 0;
        for (pop = 0; pop < data->numpop; ++pop)
            sumtips += data->numind[pop][0];//Assumes UEP is matched by locus 1
        bufsize += 1 + sprintf (fp, "%li %li\n", sumtips, data->uepsites);
        *buffer = (char *) myrealloc (*buffer, sizeof (char) * bufsize);
        strcat (*buffer, fp);
        // here pop counts tips, not populations; the loops have to stop after
        // sumtips rows (a bare `sumtips` condition never becomes false)
        if (strchr (SEQUENCETYPES, options->datatype))
        {
            for (pop = 0; pop < sumtips; pop++)
            {
                for (i = 0; i < data->uepsites; i++)
                {
                    bufsize += 1 + sprintf (fp, "%i\n", data->uep[pop][i]);
                    *buffer =
                        (char *) myrealloc (*buffer, sizeof (char) * bufsize);
                    strcat (*buffer, fp);
                }
            }
        }
        else
        {
            // two rows per tip, the second one is stored sumtips rows further down
            for (pop = 0; pop < sumtips; pop++)
            {
                for (i = 0; i < data->uepsites; i++)
                {
                    bufsize += 1 + sprintf (fp, "%i %i\n", data->uep[pop][i],
                                            data->uep[pop + sumtips][i]);
                    *buffer =
                        (char *) myrealloc (*buffer, sizeof (char) * bufsize);
                    strcat (*buffer, fp);
                }
            }
        }
    }

#endif
    return bufsize;
}

/// Appends the two alleles of one individual, one "allele1 allele2" line per
/// locus, to *buffer and enlarges the buffer accordingly.
/// \param buffer  address of the output buffer (reallocated)
/// \param bufsize running allocation size, grows by line length + 1 per locus
/// \param data    supplies loci and the allele strings in yy
/// \param pop     population index
/// \param ind     individual index
void
pack_allele_data (char **buffer, long *bufsize, data_fmt * data, long pop,
                  long ind)
{
    char line[LONGLINESIZE];
    long loc = 0;
    int written;

    while (loc < data->loci)
    {
        written = sprintf (line, "%s %s\n", data->yy[pop][ind][loc][0],
                           data->yy[pop][ind][loc][1]);
        *bufsize += 1 + written;
        *buffer = (char *) myrealloc (*buffer, sizeof (char) * *bufsize);
        strcat (*buffer, line);
        loc++;
    }
}

/// Appends the sequence of one individual at one locus as a single line to
/// *buffer; the buffer grows by sites[locus] + 2 characters.
/// \param buffer  address of the output buffer (reallocated)
/// \param bufsize running allocation size
/// \param data    supplies seq->sites and the sequence string in yy
/// \param pop     population index
/// \param ind     individual index
/// \param locus   locus index
void
pack_sequence_data (char **buffer, long *bufsize, data_fmt * data, long pop,
                    long ind, long locus)
{
    // room for the sequence, the newline and the terminating zero
    const long linesize = 2 + data->seq->sites[locus];
    char *line = (char *) mycalloc (linesize, sizeof (char));

    sprintf (line, "%s\n", data->yy[pop][ind][locus][0]);
    *bufsize += linesize;
    *buffer = (char *) myrealloc (*buffer, sizeof (char) * *bufsize);
    strcat (*buffer, line);
    free (line);
}

// this function and get_data() do not mix well!
/// Rebuilds the data set from the text produced by pack_databuffer().
/// The records are read line by line with sgets() in this order:
///   datatype, hasghost, numpop, loci
///   sites per locus, then addon and fracchange
///   per population: name, (numind, numalleles) per locus, then the individuals
///   geo/lgeo pairs when options->geo is set (otherwise geo is filled with 1.0)
///   UEP states when compiled with UEP and options->uep is set
/// The data structures are allocated on the way with init_data_structure1/2/3().
/// \param buffer  serialized data set
/// \param data    structure to fill
/// \param options receives the datatype; supplies geo, uep and micro_stepnum
void
unpack_databuffer (char *buffer, data_fmt * data, option_fmt * options)
{
    long locus, pop, ind, i=0;
    long biggest;
    long inputsize = LONGLINESIZE; // current allocation of input, never shrinks
    long needed;
#ifdef UEP

    long sumtips;
#endif

    char *buf = buffer;
    char *input;
    long hasghost;
    input = (char *) mycalloc (LONGLINESIZE, sizeof (char));
    sgets (input, LONGLINESIZE, &buf);
    sscanf (input, "%c%li%li%li", &options->datatype, &hasghost, &data->numpop,
            &data->loci);
    data->hasghost = (boolean) hasghost;
    init_data_structure1 (&data, options);
    for (locus = 0; locus < data->loci; locus++)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li", &data->seq->sites[locus]);
    }
    sgets (input, LONGLINESIZE, &buf);
    sscanf (input, "%li%lf", &data->seq->addon, &data->seq->fracchange);
    // population data
    for (pop = 0; pop < data->numpop; pop++)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%s", data->popnames[pop]);
        biggest=0;
        for(locus=0; locus<data->loci; locus++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%li %li", &data->numind[pop][locus],&data->numalleles[pop][locus]);
            if(biggest<data->numind[pop][locus])
                biggest = data->numind[pop][locus];
        }
        init_data_structure2 (&data, options, pop);
        if (!strchr (SEQUENCETYPES, options->datatype))
        {
            for (ind = 0; ind < biggest; ind++)
            {
                sgets (input, LONGLINESIZE, &buf);
                sscanf (input, "%s", data->indnames[pop][ind]);
                for (locus = 0; locus < data->loci; locus++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%s %s", data->yy[pop][ind][locus][0],
                            data->yy[pop][ind][locus][1]);
                }
            }
        }
        else
        {
            for (locus = 0; locus < data->loci; locus++)
            {
                for (ind = 0; ind < data->numind[pop][locus]; ind++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%s", data->indnames[pop][ind]);
                    // a sequence line may be longer than LONGLINESIZE; the line
                    // buffer is only ever enlarged, because all other reads keep
                    // passing LONGLINESIZE as its capacity
                    needed = 100 + data->seq->sites[locus];
                    if (needed > inputsize)
                    {
                        input = (char *) myrealloc (input, sizeof (char) * needed);
                        inputsize = needed;
                    }
                    sgets (input, needed, &buf);
                    sscanf (input, "%s", data->yy[pop][ind][locus][0]);
                }
            }
        }
    }
    // geofile
    data->geo =
        (MYREAL *) mycalloc (1, sizeof (MYREAL) * data->numpop * data->numpop);
    data->lgeo =
        (MYREAL *) mycalloc (1, sizeof (MYREAL) * data->numpop * data->numpop);
    if (!options->geo)
    {
        for (i = 0; i < data->numpop * data->numpop; i++)
            data->geo[i] = 1.0;
    }
    else
    {
        for (pop = 0; pop < data->numpop * data->numpop; pop++)
        {
            sgets (input, LONGLINESIZE, &buf);
            sscanf (input, "%lf%lf", &data->geo[pop], &data->lgeo[pop]);
        }
    }
    // uepfile
#ifdef UEP
    if (options->uep)
    {
        sgets (input, LONGLINESIZE, &buf);
        sscanf (input, "%li%li", &sumtips, &data->uepsites);
        data->uep =
            (int **) mycalloc (number_genomes (options->datatype) * sumtips,
                             sizeof (int *));
        // here pop counts tips; the loops stop after sumtips rows and every row
        // is allocated at the index that is filled right afterwards
        if (strchr (SEQUENCETYPES, options->datatype))
        {
            for (pop = 0; pop < sumtips; pop++)
            {
                data->uep[pop] = (int *) mycalloc (data->uepsites, sizeof (int));
                for (i = 0; i < data->uepsites; i++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%i", &data->uep[pop][i]);
                }
            }
        }
        else
        {
            // two rows per tip, the second one is stored sumtips rows further down
            for (pop = 0; pop < sumtips; pop++)
            {
                data->uep[pop] = (int *) mycalloc (data->uepsites, sizeof (int));
                data->uep[pop + sumtips] =
                    (int *) mycalloc (data->uepsites, sizeof (int));
                for (i = 0; i < data->uepsites; i++)
                {
                    sgets (input, LONGLINESIZE, &buf);
                    sscanf (input, "%i%i", &data->uep[pop][i],
                            &data->uep[pop + sumtips][i]);
                }
            }
        }

    }
#endif
    init_data_structure3 (data);

    switch (options->datatype)
    {
    case 'a':
        create_alleles (data);
        break;
    case 'b':
        for (pop = 0; pop < data->loci; pop++)
            data->maxalleles[pop] = XBROWN_SIZE;
        break;
    case 'm':
        create_alleles (data);
        for (pop = 0; pop < data->loci; pop++)
            data->maxalleles[pop] = options->micro_stepnum;
        break;
    }
    free (input);
}

/// Reads the results of one locus from a text buffer (one number per line) into
/// world->atl and world->apg0.
/// For each replicate (plus one extra record when maxrep > 1) the buffer holds
/// param_like followed by 4*numpop*numpop parameters; after that follow
/// options->lsteps apg0 values for each of the maxrep replicates.
/// \param buffer text written by pack_result_buffer()
/// \param world  receives the values
/// \param locus  locus the values belong to
/// \param maxrep number of replicates
/// \param numpop number of populations
void
unpack_result_buffer (char *buffer, world_fmt * world,
                      long locus, long maxrep, long numpop)
{
    timearchive_fmt **atl = world->atl;
    MYREAL ***apg0 = world->apg0;
    const long nparam = 4 * numpop * numpop;
    const long nsteps = world->options->lsteps;
    // with several replicates there is one additional record
    const long nrecords = (maxrep > 1) ? maxrep + 1 : maxrep;
    char *cursor = buffer;
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    long r, k;

    for (r = 0; r < nrecords; r++)
    {
        sgets (line, LONGLINESIZE, &cursor);
        atl[r][locus].param_like = atof (line);
        for (k = 0; k < nparam; k++)
        {
            sgets (line, LONGLINESIZE, &cursor);
            atl[r][locus].parameters[k] = atof (line);
        }
    }
    // apg0
    for (r = 0; r < maxrep; r++)
    {
        for (k = 0; k < nsteps; k++)
        {
            sgets (line, LONGLINESIZE, &cursor);
            apg0[r][locus][k] = atof (line);
        }
    }
    free(line);
}

/// Writes the results of one locus as text, one number per line, into *buffer.
/// For each replicate (plus one extra record when maxrep > 1) param_like is
/// followed by 4*numpop*numpop parameters; after that follow options->lsteps
/// apg0 values for each of the maxrep replicates.
/// \param buffer address of the output buffer, reset to an empty string first
/// \param world  supplies atl, apg0 and options->lsteps
/// \param locus  locus to pack
/// \param maxrep number of replicates
/// \param numpop number of populations
/// \return the accumulated allocation size of *buffer
long
pack_result_buffer (char **buffer, world_fmt * world,
                    long locus, long maxrep, long numpop)
{
    timearchive_fmt **atl = world->atl;
    MYREAL ***apg0 = world->apg0;
    const long nparam = 4 * numpop * numpop;
    const long nsteps = world->options->lsteps;
    // with several replicates there is one additional record
    const long nrecords = (maxrep > 1) ? maxrep + 1 : maxrep;
    long total = 1;
    long r, k;
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));

    (*buffer) = (char *) myrealloc (*buffer, sizeof (char) * total);
    memset (*buffer, 0, sizeof (char) * total);
    line[0] = '\0';

    for (r = 0; r < nrecords; r++)
    {
        // every value grows the allocation by its formatted length plus one
        total += 1 + sprintf (line, "%20.20f\n", atl[r][locus].param_like);
        (*buffer) = (char *) myrealloc (*buffer, sizeof (char) * total);
        strcat (*buffer, line);
        for (k = 0; k < nparam; k++)
        {
            total += 1 + sprintf (line, "%20.20f\n", atl[r][locus].parameters[k]);
            (*buffer) = (char *) myrealloc (*buffer, sizeof (char) * total);
            strcat (*buffer, line);
        }
    }
    // apg0
    for (r = 0; r < maxrep; r++)
    {
        for (k = 0; k < nsteps; k++)
        {
            total += 1 + sprintf (line, "%20.20f\n", apg0[r][locus][k]);
            (*buffer) = (char *) myrealloc (*buffer, sizeof (char) * total);
            strcat (*buffer, line);
        }
    }
    free(line);
    return total;
}

///
/// unpacks replicate samples of migration events and appends them to the records
/// already stored for the locus: the number of records read from the buffer is
/// added to mighistnum and the new records are stored behind the existing ones.
/// This function is only used with replicates over multiple loci.
/// \param input  caller-supplied line buffer (read with LONGLINESIZE as capacity)
/// \param buffer text to unpack
/// \param world  world->mighistloci[locus] receives the records
/// \param locus  locus the records belong to
/// \param numpop not used by this function
void
unpack_mighist_replicate_buffer (char *input, char *buffer, world_fmt * world,
                       long locus, long numpop)
{
    mighistloci_fmt *store = &world->mighistloci[locus];
    mighist_fmt *rec;
    char *cursor = buffer;
    long incoming;
    long first_new;
    long slot, ev;

    sgets (input, LONGLINESIZE, &cursor);
    sscanf (input, "%li", &incoming);
    first_new = store->mighistnum;
    store->mighistnum += incoming;
    if(store->allocsize <= store->mighistnum)
    {
        // enlarge the record array and give every new slot a one-element event array
        store->mighist = (mighist_fmt *) myrealloc (store->mighist, sizeof (mighist_fmt) *(store->mighistnum+1));
        for(slot = store->allocsize; slot <= store->mighistnum; slot++)
        {
            rec = &store->mighist[slot];
            rec->allocsize = 1;
            rec->migeventsize = 0;
            rec->migevents = (migevent_fmt *) mycalloc (1,  sizeof (migevent_fmt) );
        }
        store->allocsize = store->mighistnum+1;
    }
    for (slot = first_new; slot < store->mighistnum; slot++)
    {
        rec = &store->mighist[slot];
        // record header: copies, weight, number of events
        sgets (input, LONGLINESIZE, &cursor);
        sscanf (input, "%li %li %li", &rec->copies, &rec->weight, &rec->migeventsize);
        rec->allocsize = rec->migeventsize;
        rec->migevents = (migevent_fmt *) myrealloc (rec->migevents,
                                                     sizeof (migevent_fmt) * rec->allocsize);
        for (ev = 0; ev < rec->migeventsize; ev++)
        {
            sgets (input, LONGLINESIZE, &cursor);
            sscanf (input, "%lf %lf %lf %lf",
                    &rec->migevents[ev][0],
                    &rec->migevents[ev][1],
                    &rec->migevents[ev][2],
                    &rec->migevents[ev][3]);
        }
    }
}

/// Unpacks the migration event records of one locus; the record count read from
/// the buffer replaces mighistnum and the records are stored from index 0 on.
/// \param buffer text written by pack_mighist_buffer()
/// \param world  world->mighistloci[locus] receives the records
/// \param locus  locus the records belong to
/// \param maxrep not used by this function
/// \param numpop not used by this function
void
unpack_mighist_buffer (char *buffer, world_fmt * world,
                       long locus, long maxrep, long numpop)
{
    mighistloci_fmt *store = &world->mighistloci[locus];
    mighist_fmt *rec;
    char *cursor = buffer;
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    long slot, ev;

    sgets (line, LONGLINESIZE, &cursor);
    sscanf (line, "%li", &store->mighistnum);
    if(store->allocsize <= store->mighistnum)
    {
        // enlarge the record array and give every new slot a one-element event array
        store->mighist = (mighist_fmt *) myrealloc (store->mighist, sizeof (mighist_fmt) *(store->mighistnum+1));
        for(slot = store->allocsize; slot <= store->mighistnum; slot++)
        {
            rec = &store->mighist[slot];
            rec->allocsize = 1;
            rec->migeventsize = 0;
            rec->migevents = (migevent_fmt *) mycalloc (1,  sizeof (migevent_fmt) );
        }
        store->allocsize = store->mighistnum+1;
    }
    for (slot = 0; slot < store->mighistnum; slot++)
    {
        rec = &store->mighist[slot];
        // record header: copies, weight, number of events
        sgets (line, LONGLINESIZE, &cursor);
        sscanf (line, "%li %li %li", &rec->copies, &rec->weight, &rec->migeventsize);
        rec->allocsize = rec->migeventsize;
        rec->migevents = (migevent_fmt *) myrealloc (rec->migevents,
                                                     sizeof (migevent_fmt) * rec->allocsize);
        for (ev = 0; ev < rec->migeventsize; ev++)
        {
            sgets (line, LONGLINESIZE, &cursor);
            sscanf (line, "%lf %lf %lf %lf",
                    &rec->migevents[ev][0],
                    &rec->migevents[ev][1],
                    &rec->migevents[ev][2],
                    &rec->migevents[ev][3]);
        }
    }
    free(line);
}


/// Packs the migration event records of one locus as text: the record count,
/// then for every record a header line (copies, weight, number of events)
/// followed by one line with four values per event.
/// The buffer is sized up front with 100 characters per record header and per
/// event and cleared before the text is written.
/// \param buffer address of the output buffer (reallocated)
/// \param world  world->mighistloci[locus] supplies the records
/// \param locus  locus to pack
/// \param maxrep not used by this function
/// \param numpop not used by this function
/// \return length of the packed string including the terminating zero
long
pack_mighist_buffer (char **buffer, world_fmt * world,
                     long locus, long maxrep, long numpop)
{
    mighistloci_fmt *store = &world->mighistloci[locus];
    mighist_fmt *rec;
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    long capacity = 1;
    long slot, ev;

    for (slot = 0; slot < store->mighistnum; slot++)
        capacity += 100 + store->mighist[slot].migeventsize * 100;
    (*buffer) = (char *) myrealloc ((*buffer), sizeof (char) * capacity);
    memset (*buffer, 0, sizeof (char) * capacity);

    sprintf (line, "%li\n", store->mighistnum);
    strcat ((*buffer), line);
    for (slot = 0; slot < store->mighistnum; slot++)
    {
        rec = &store->mighist[slot];
        sprintf (line, "%li %li %li\n", rec->copies, rec->weight, rec->migeventsize);
        strcat ((*buffer), line);
        for (ev = 0; ev < rec->migeventsize; ev++)
        {
            sprintf (line, "%20.20f %f %f %f\n",
                     rec->migevents[ev][0],
                     rec->migevents[ev][1],
                     rec->migevents[ev][2],
                     rec->migevents[ev][3]);
            strcat ((*buffer), line);
        }
    }
    free(line);
    return (long) strlen (*buffer) + 1;
}


///
/// unpack bayes parameters to fit into mpi_results_master();
/// all the work is done by unpack_hist_bayes_buffer(), which handles the
/// single_bayes part and the histogram part of the buffer.
/// maxrep and numpop are not used.
void
unpack_bayes_buffer (char *buffer, world_fmt * world,
                       long locus, long maxrep, long numpop)
{
    bayes_fmt *bayes = world->bayes;

    unpack_hist_bayes_buffer(buffer, bayes, world, locus);
}

///
/// pack bayes parameters to fit into mpi_results_worker();
/// calls pack_single_bayes_buffer_part() and then pack_hist_bayes_buffer() on the
/// same buffer and returns the sum of the two sizes they report.
/// maxrep and numpop are not used.
long 
pack_bayes_buffer (char **buffer, world_fmt * world,
                     long locus, long maxrep, long numpop)
{
    long params_size;
    long hist_size;

    // the order matters: the histogram part is packed after the parameter part
    params_size = pack_single_bayes_buffer_part(buffer,world->bayes,world,locus);
    hist_size = pack_hist_bayes_buffer(buffer, world->bayes, world, locus);
    return params_size + hist_size;
}

///
/// unpack the bayes histogram of one locus together with the acceptance counts
/// that precede it in the buffer.
/// Buffer layout, one record per line:
///   locus and number of parameter records (the latter is read but not used)
///   numpop2+1 pairs "accepted trials", added to world->bayes->accept/trials
///   numpop2 bin counts
///   11*numpop2 datastore values
///   for every bin of every parameter: set50 char, set95 char, result value
/// When the locus in the buffer differs from the locus argument,
/// world->bayes->numparams is reset and the locus from the buffer is used.
void unpack_hist_bayes_buffer(char *buffer, bayes_fmt *bayes, world_fmt *world, long locus)
{
    const long          npar = world->numpop2;
    long                k;
    long                par;
    long                offset;
    long                nrecords;
    long                accepted, tried;
    long                sent_locus;
    long                allbins = 0;
    char                *cursor = buffer;
    char                *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    bayeshistogram_fmt  *hist;

    // header
    sgets (line, LONGLINESIZE, &cursor);
    sscanf (line, "%li %li", &sent_locus, &nrecords);
    if(sent_locus != locus)
    {
        world->bayes->numparams=0;
        locus = sent_locus;
    }

    // acceptance ratios are added to the ones we have already
    // the last acceptance is the one for the genealogies
    for (k = 0; k <= npar; ++k)
    {
        sgets (line, LONGLINESIZE, &cursor);
        sscanf (line, "%li %li", &accepted, &tried);
        world->bayes->accept[k] += accepted;
        world->bayes->trials[k] += tried;
    }

    // histogram
    hist = &(bayes->histogram[locus]);
    hist->datastore = (MYREAL *) myrealloc((void *) hist->datastore, sizeof(MYREAL) * 11 * npar);

    for(k = 0; k < npar; ++k)
    {
        sgets (line, LONGLINESIZE, &cursor);
        sscanf (line, "%li", &hist->bins[k]);
        allbins += hist->bins[k];
    }
    doublevec1d (&hist->results, allbins * npar);
    // set95 and set50 share one allocation, set50 starts behind the set95 part
    hist->set95 = (char *) calloc(allbins * npar * 2 + 2, sizeof(char));
    hist->set50 = hist->set95 + allbins * npar + 1;

    for(k = 0; k < 11 * npar; ++k)
    {
        sgets (line, LONGLINESIZE, &cursor);
        sscanf (line, "%lf", &hist->datastore[k]);
    }

    // the bins of all parameters are stored back to back, offset marks the
    // first bin of the current parameter
    offset = 0;
    for(par = 0; par < npar; par++)
    {
        for(k = 0; k < hist->bins[par]; k++)
        {
            sgets (line, LONGLINESIZE, &cursor);
            sscanf (line, "%c%c%lf", &hist->set50[offset + k],&hist->set95[offset + k], &hist->results[offset + k]);
        }
        offset += hist->bins[par];
    }
    free(line);
}

///
/// pack the bayes histogram of one locus; the text is appended to the string
/// that is already in *buffer.
/// Records, one per line: numpop2 bin counts, 11*numpop2 datastore values, and
/// for every bin of every parameter the set50 char, the set95 char and the result.
/// \return length of the complete string in *buffer including the terminating zero
long pack_hist_bayes_buffer(char **buffer, bayes_fmt *bayes, world_fmt * world, long locus)
{
    const long npar = world->numpop2;
    bayeshistogram_fmt *hist = &(bayes->histogram[locus]);
    char *line;
    long estimate;
    long allbins = 0;
    long offset;
    long par, k;

    // number of values to write on top of what is already in the buffer:
    // datastore values and bin counts ...
    estimate = (long) strlen(*buffer) + 1 + 11 * npar + npar;
    line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    for(par = 0; par < npar; par++)
    {
        allbins += hist->bins[par];
    }
    // ... plus room for all bins
    estimate += allbins * npar;
    // 20 characters are reserved per counted value
    (*buffer) = (char *) myrealloc ((*buffer), sizeof (char) * (20 * estimate));

    for(par = 0; par < npar; ++par)
    {
        sprintf (line, "%li\n", hist->bins[par]);
        strcat ((*buffer), line);
    }

    for(k = 0; k < 11 * npar; ++k)
    {
        sprintf (line, "%f\n", hist->datastore[k]);
        strcat ((*buffer), line);
    }

    // the bins of all parameters are stored back to back, offset marks the
    // first bin of the current parameter
    offset = 0;
    for(par = 0; par < npar; par++)
    {
        for(k = 0; k < hist->bins[par]; k++)
        {
            sprintf (line, "%c%c%f\n",  hist->set50[offset + k], hist->set95[offset + k], hist->results[offset + k]);
            strcat ((*buffer), line);
        }
        offset += hist->bins[par];
    }

    free(line);
    return (long) strlen(*buffer)+1;
}


///
/// Unpack a Bayes parameter buffer sent from replicant nodes AND lociworkers to the master.
/// The first line holds the locus the data belongs to and the number of samples
/// that follow. When that locus differs from the argument locus the samples
/// stored so far are discarded (numparams is set to 0); otherwise the new
/// samples are added after the ones already present. Each sample consists of
/// world->numpop2 + 1 values, one per line, the first being the likelihood.
/// The trailing world->numpop2 + 1 lines carry "accepted trials" pairs that
/// are added to the running totals; the last pair belongs to the genealogies.
/// Everything is stored in world->bayes; the bayes argument is not used.
/// \param buffer text produced by pack_single_bayes_buffer()
/// \param bayes  unused
/// \param world  receives the samples and acceptance counts in world->bayes
/// \param locus  locus the master currently collects samples for
void unpack_single_bayes_buffer(char *buffer,bayes_fmt * bayes, world_fmt * world,long locus)
{
    bayes_fmt *store = world->bayes;
    const long ncol = world->numpop2 + 1;
    char *cursor = buffer;
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    long srclocus, last;
    long nacc, ntry;
    long first;
    long row, col;

    sgets (line, LONGLINESIZE, &cursor);
    sscanf (line, "%li %li", &srclocus, &last);
    if (srclocus != locus)
        store->numparams = 0;

    // [first, last) is the range of sample rows filled by this buffer
    first = store->numparams;
    last += first;
    if (last >= store->allocparams)
    {
        const long newalloc = last + 1;
        store->params = (MYREAL **) myrealloc((void *) store->params,sizeof(MYREAL*)*newalloc);
        for (row = store->allocparams; row < newalloc; row++)
            store->params[row] = (MYREAL *) mycalloc(ncol,sizeof(MYREAL));
        store->allocparams = newalloc;
    }

    for (row = first; row < last; ++row)
    {
        for (col = 0; col < ncol; ++col) // column 0 is the likelihood
        {
            sgets (line, LONGLINESIZE, &cursor);
            sscanf (line, "%lf", &store->params[row][col]);
        }
    }
    store->numparams = last;

    // acceptance ratios are accumulated on top of the existing counts
    for (col = 0; col < ncol; ++col)
    {
        sgets (line, LONGLINESIZE, &cursor);
        sscanf (line, "%li %li", &nacc, &ntry);
        store->accept[col] += nacc;
        store->trials[col] += ntry;
    }

    free(line);
}

///
/// Pack the Bayes parameter samples, sent from replicant nodes AND lociworkers to the master.
/// The text is appended to the NUL-terminated string already held in *buffer
/// (a NULL *buffer is treated as an empty string). Layout, one item per line:
///   "locus numparams", then for every sample world->numpop2 + 1 values (the
///   first is the log(likelihood)), then world->numpop2 + 1 "accept trials" pairs.
/// The allocation accounts for the text already present and reserves 100
/// characters for every value or pair that is written.
/// \param buffer address of the heap buffer to extend; reallocated here
/// \param bayes  source of numparams (header line), params, accept and trials
/// \param world  supplies numpop2 and, through world->bayes->numparams, the number of rows
/// \param locus  locus number written into the header line
/// \return length of the resulting string plus one for the terminating NUL
long pack_single_bayes_buffer(char **buffer, bayes_fmt *bayes, world_fmt *world,long locus)
{
    long i, j;
    char *input;
    char *tail;
    const int fresh = (*buffer == NULL);
    const long used = fresh ? 0 : (long) strlen(*buffer);
    const long nrows = world->bayes->numparams;
    const long ncols = world->numpop2 + 1; // likelihood plus all parameters
    // header (2 slots) + one slot per sample value + one slot per acceptance pair
    const long payload = 100 * (nrows * ncols + ncols + 2);

    input = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    (*buffer) = (char *) myrealloc(*buffer, sizeof(char) * (used + payload + 1));
    if (fresh)
        (*buffer)[0] = '\0';
    // tail always points at the terminating NUL, so appending never rescans the buffer
    tail = (*buffer) + used;

    sprintf (input, "%li %li\n", locus, bayes->numparams);
    strcat (tail, input);
    tail += strlen(tail);

    for(i = 0; i < nrows; ++i)
    {
        for (j = 0; j < ncols; ++j) //the first element is the log(likelihood)
        {
            sprintf (input, "%f\n", bayes->params[i][j]);
            strcat (tail, input);
            tail += strlen(tail);
        }
    }
    // acceptance ratios; the last pair is the one for the genealogies
    for (j = 0; j < ncols; ++j)
    {
        sprintf (input, "%li %li\n", bayes->accept[j], bayes->trials[j]);
        strcat (tail, input);
        tail += strlen(tail);
    }
    free(input);
    return (long) (tail - (*buffer)) + 1;
}
///
/// Pack only the header and the acceptance counts of the Bayes run, without
/// the parameter samples. The buffer is reallocated and zeroed first, so any
/// previous content is discarded. Layout: "locus numparams" followed by
/// world->numpop2 + 1 lines holding "accept trials" pairs.
/// \param buffer address of the heap buffer; reallocated and overwritten
/// \param bayes  source of numparams, accept and trials
/// \param world  supplies numpop2 and world->bayes->numparams for the size estimate
/// \param locus  locus number written into the header line
/// \return length of the packed string plus one for the terminating NUL
long pack_single_bayes_buffer_part(char **buffer, bayes_fmt *bayes, world_fmt *world,long locus)
{
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    // same generous estimate as used for the full sample buffer
    const size_t nbytes = sizeof(char) * 100 * (world->bayes->numparams * world->numpop2 + world->numpop2 + 2);
    long k = 0;

    (*buffer) = (char *) myrealloc(*buffer, nbytes);
    memset (*buffer, 0, nbytes);

    sprintf (line, "%li %li\n", locus, bayes->numparams);
    strcat ((*buffer), line);

    while (k <= world->numpop2)
    {
        sprintf (line, "%li %li\n", bayes->accept[k], bayes->trials[k]);
        strcat ((*buffer), line);
        ++k;
    }
    free(line);
    return (long) strlen(*buffer)+1;
}

///
/// Unpack the minimal statistic trees of all replicates of one locus.
/// A scratch line buffer is allocated once and handed to
/// unpack_single_sumfile_buffer() for every replicate.
/// NOTE(review): buffer is passed unchanged for each replicate, so every
/// replicate is parsed from the start of the same text -- confirm this is intended.
/// \todo there are differences between unpack_sumfile() and read_savesum(); this needs reconciliation
/// \param buffer  packed text
/// \param world   receives the data in world->atl
/// \param locus   locus the data belongs to
/// \param maxrep  number of replicates to unpack
/// \param numpop  number of populations
void
unpack_sumfile_buffer (char *buffer, world_fmt * world,
                       long locus, long maxrep, long numpop)
{
    timearchive_fmt **archive = world->atl;
    char *scratch = (char*) mycalloc(LONGLINESIZE*world->numpop2,sizeof(char));
    long rep = 0;

    while (rep < maxrep)
    {
        unpack_single_sumfile_buffer (scratch, buffer, archive, world, locus, rep, numpop);
        rep++;
    }
    free(scratch);
}



///
/// Unpack the minimal statistic trees of a single replicate.
/// Text layout (mirrors pack_single_sumfile_buffer()):
///   "T numpop sumtips param_like"
///   T records: "copies lcopies" followed by 3*numpop + numpop*(numpop-1) data values, one per line
///   world->numpop2 lines "param param0"
///   "trials normd"
/// The likelihood is also copied into world->chainlikes, the time archive is
/// enlarged through increase_timearchive() before the records are read, and
/// lparam0 is derived from param0 with log_param0().
/// \param input     caller-supplied scratch buffer for one line
/// \param buffer    packed text; always read from its start
/// \param ta        time archive, indexed [replicate][locus]
/// \param world     supplies numpop, numpop2 and chainlikes
/// \param locus     locus index
/// \param replicate replicate index
/// \param numpop    number of populations used for the record length
void
unpack_single_sumfile_buffer (char *input, char *buffer, timearchive_fmt **ta, world_fmt *world,
                              long locus, long replicate, long numpop)
{
    const long nvalues = 3 * numpop + numpop * (numpop - 1);
    char *cursor = buffer;
    timearchive_fmt *rec;
    long t, k;

    sgets (input, LONGLINESIZE, &cursor);
    sscanf (input, "%li %li %li %lg", &ta[replicate][locus].T, &ta[replicate][locus].numpop,
            &ta[replicate][locus].sumtips, &ta[replicate][locus].param_like);
    world->chainlikes[locus][replicate] = ta[replicate][locus].param_like;
    increase_timearchive (world, locus, ta[replicate][locus].T, world->numpop, replicate);

    // the shortcut is taken only after the archive was resized
    rec = &ta[replicate][locus];

    for (t = 0; t < rec->T; t++)
    {
        sgets (input, LONGLINESIZE, &cursor);
        sscanf (input, "%li %lg", &rec->tl[t].copies, &rec->tl[t].lcopies);
        for (k = 0; k < nvalues; k++)
        {
            sgets (input, LONGLINESIZE, &cursor);
            sscanf (input, "%lg", &rec->tl[t].data[k]);
        }
    }

    for (k = 0; k < world->numpop2; k++)
    {
        sgets (input, LONGLINESIZE, &cursor);
        sscanf (input, "%lg %lg", &rec->param[k], &rec->param0[k]);
    }
    log_param0 (rec->param0, rec->lparam0, world->numpop2);

    sgets (input, LONGLINESIZE, &cursor);
    sscanf (input, "%li %lg", &rec->trials, &rec->normd);
}


///
/// Append text to the sumfile buffer under construction, enlarging it first
/// whenever the text would not fit.
/// \param buffer   address of the heap buffer
/// \param used     current string length without the terminating NUL; updated
/// \param capacity number of allocated characters; updated when the buffer grows
/// \param text     NUL-terminated text to append
static void
append_sumfile_text (char **buffer, long *used, long *capacity, const char *text)
{
    const long extra = (long) strlen (text);
    if (*used + extra + 1 > *capacity)
    {
        // reserve twice the requirement so that repeated appends stay cheap
        *capacity = 2 * (*used + extra + 1);
        (*buffer) = (char *) myrealloc (*buffer, sizeof (char) * (*capacity));
    }
    // (*buffer) + *used is the terminating NUL: strcat does not rescan the buffer
    strcat ((*buffer) + *used, text);
    *used += extra;
}

///
/// Pack the minimal statistic trees of a single replicate of one locus.
/// Any previous content of *buffer is discarded. Text layout:
///   "T numpop sumtips param_like"
///   T records: "copies lcopies" followed by 3*numpop + numpop*(numpop-1) data values, one per line
///   numpop*numpop lines "param param0"
///   "trials normd"
/// Every append checks the remaining capacity, so the buffer cannot be
/// overrun regardless of the number of populations or time intervals.
/// \param buffer    address of the heap buffer; reallocated and overwritten
/// \param world     supplies the time archive world->atl and numpop2
/// \param locus     locus index
/// \param replicate replicate index
/// \param numpop    number of populations
/// \return length of the packed string plus one for the terminating NUL
long pack_single_sumfile_buffer(char **buffer, world_fmt * world,
                                long locus, long replicate, long numpop)
{
    long i, j;
    const long numpop2 = numpop * numpop;
    const long ndata = 3 * numpop + numpop * (numpop - 1);
    long used = 0;
    long capacity;
    timearchive_fmt **ta = world->atl;
    char *input;

    input = (char*) mycalloc(LONGLINESIZE*world->numpop2,sizeof(char));
    sprintf (input, "%li %li %li %g \n", ta[replicate][locus].T,
             ta[replicate][locus].numpop, ta[replicate][locus].sumtips, ta[replicate][locus].param_like);
    // initial guess: a thousand header-sized lines
    capacity = 1000 * (long) strlen (input) + 2;
    (*buffer) = (char *) myrealloc (*buffer, sizeof (char) * capacity);
    (*buffer)[0] = '\0';
    append_sumfile_text (buffer, &used, &capacity, input);

    for (i = 0; i < ta[replicate][locus].T; i++)
    {
        sprintf (input, "%li %g\n", ta[replicate][locus].tl[i].copies, ta[replicate][locus].tl[i].lcopies);
        append_sumfile_text (buffer, &used, &capacity, input);
        for (j = 0; j < ndata; j++)
        {
            sprintf (input, "%g\n", ta[replicate][locus].tl[i].data[j]);
            append_sumfile_text (buffer, &used, &capacity, input);
        }
    }

    for (i = 0; i < numpop2; i++)
    {
        sprintf (input, "%g %g\n", ta[replicate][locus].param[i], ta[replicate][locus].param0[i]);
        append_sumfile_text (buffer, &used, &capacity, input);
    }
    sprintf (input, "%li %g\n", ta[replicate][locus].trials, ta[replicate][locus].normd);
    append_sumfile_text (buffer, &used, &capacity, input);

    free(input);
    return used + 1;
}

///
/// Pack the minimal statistic trees of all replicates of one locus.
/// A size estimate is computed from the number of time intervals T of every
/// replicate, the buffer is reallocated to that size and emptied, and
/// pack_single_sumfile_buffer() is called once per replicate. The final string
/// length is compared with the estimate and error() is called when it is larger.
/// NOTE(review): pack_single_sumfile_buffer() reallocates and empties *buffer
/// on every call, so the text of earlier replicates does not appear to survive
/// the loop -- confirm against the callers.
/// \param buffer  address of the heap buffer; reallocated and overwritten
/// \param world   supplies the time archive world->atl
/// \param locus   locus index
/// \param maxrep  number of replicates
/// \param numpop  number of populations
/// \return length of the packed string plus one for the terminating NUL
long
pack_sumfile_buffer (char **buffer, world_fmt * world,
                     long locus, long maxrep, long numpop)
{
    timearchive_fmt **archive = world->atl;
    // characters reserved per time interval
    const long per_interval = 22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50;
    long reserved = 0;
    long packed;
    long rep;

    for (rep = 0; rep < maxrep; rep++)
        reserved += 100 + archive[rep][locus].T * per_interval;

    (*buffer) = (char *) myrealloc ((*buffer), reserved * sizeof (char));
    (*buffer)[0] = '\0';

    rep = 0;
    while (rep < maxrep)
    {
        pack_single_sumfile_buffer(buffer, world, locus, rep, numpop);
        rep++;
    }

    packed = (long) strlen(*buffer)+1;
    if (packed > reserved)
        error("allocation exceeded in pack_sumfile_buffer");
    return packed;
}


///
/// Gather results (sumfiles, results, migrate-histogram, ..) from the workers.
/// A request carrying sendtype is sent to the ranks 1 .. min(loci + 1, numcpu) - 1;
/// afterwards packed buffers are received until all world->loci loci are
/// accounted for and each buffer is handed to the unpack callback.
/// Without BIGMEMORY every locus arrives as a size message followed by the
/// text, and the locus number is taken from the message tag (tag - 1).
/// With BIGMEMORY a worker sends a single workload table
/// {locus1, offset1, locus2, offset2, ..., total size, number of loci}
/// followed by one text that holds all its loci.
/// \param sendtype     kind of result requested from the workers
/// \param world        master world that receives the unpacked data
/// \param maxreplicate number of replicates, forwarded to unpack
/// \param unpack       function that parses one packed locus
void
mpi_results_master (MYREAL sendtype, world_fmt * world, long maxreplicate,
                    void (*unpack) (char *buffer, world_fmt * world,
                                    long locus, long maxrep, long numpop))
{
    long numpop = world->numpop;
    long bufsize = 1;
    // number of ranks asked is min(loci + addon, numcpu) - 1;
    // the dependence of addon on replicates/loci still needs careful checking
    long addon = 1;
    char *buffer;
    MYREAL *temp;
    int worker;
    long z, tag, sender;
    MPI_Status status;
    long numelem = world->numpop2 + (world->options->gamma ? 1 : 0);
    long numelem2 = 2 * numelem;
#ifdef BIGMEMORY

    long *workload;
    long workerloci;
    long offset;
    long locus;
    long i;
    workload = (long*) mycalloc(world->loci*2+2,sizeof(long));
#endif

    temp = (MYREAL *) mycalloc (numelem2, sizeof (MYREAL));
    buffer = (char *) mycalloc (bufsize, sizeof (char));
    temp[0] = sendtype;
    temp[1] = (MYREAL) bufsize;
    for (worker = 1; worker < MIN (world->loci + addon, numcpu); worker++)
    {
        MYMPISEND (temp, numelem2, MPI_DOUBLE, worker, worker, comm_world);
    }
    z = 0;
    while (z < world->loci)
    {
#ifdef BIGMEMORY
        MYMPIRECV (workload, 2*world->loci+2, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  comm_world, &status);
        bufsize = workload[2*world->loci] ;
        workerloci = workload[2*world->loci+1];
        buffer = (char *) myrealloc (buffer, sizeof (char) * (bufsize+ 1));
        memset (buffer, 0, sizeof (char) * (bufsize + 1));
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
        MYMPIRECV (buffer, bufsize, MPI_CHAR, (MYINT) sender, (MYINT) tag, comm_world, &status);
        for(i=0; i<workerloci;i++)
        {
            locus = workload[2*i];
            offset = workload[2*i+1];
            (*unpack) (buffer+offset, world, locus, maxreplicate, numpop);
        }
        z += workerloci-1; //the z++ below accounts for the last locus of this worker
#else
        MYMPIRECV (&bufsize, ONE, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
                  comm_world, &status);
        buffer = (char *) myrealloc (buffer, sizeof (char) * (bufsize + 1));
        memset (buffer, 0, sizeof (char) * (bufsize + 1));
        sender = status.MPI_SOURCE;
        tag = status.MPI_TAG;
#ifdef DEBUG_MPI
        FPRINTF(stdout, "%i> z=%li worker=%i bufsize=%li -------------------------------------\n",myID, z, (int) sender, bufsize);
#endif
        MYMPIRECV (buffer, bufsize, MPI_CHAR, (MYINT) sender, (MYINT) tag, comm_world, &status);
        // workers tag their messages with locus + 1
        (*unpack) (buffer, world, tag - 1, maxreplicate, numpop);
#endif
        z++;
    }
#ifdef BIGMEMORY
    free (workload);
#endif
    // buffer is released even when no result was received
    free (buffer);
    free (temp);
}

///
/// Send the packed results of all loci this worker has finished to the master.
/// The loci are taken from world->who[0 .. locidone-1] and packed with the
/// pack callback.
/// Without BIGMEMORY every locus is sent as a size message followed by the
/// text, both tagged with locus + 1.
/// With BIGMEMORY all loci are concatenated into one text that is preceded by a
/// workload table {locus1, offset1, locus2, offset2, ..., total size, number of loci}.
/// \param bufs   unused
/// \param world  worker world holding the results
/// \param maxrep number of replicates, forwarded to pack
/// \param pack   function that packs one locus and returns its size including the NUL
void
mpi_results_worker (long bufs, world_fmt * world, long maxrep,
                    long (*pack) (char **buffer, world_fmt * world,
                                  long locus, long maxrep, long numpop))
{
    long numpop = world->numpop;
    int ww, locus;
    char *allbuffer;
    long bufsize = locidone * 20; //enough space to accommodate the locus vector
#ifdef BIGMEMORY

    char *buffer;
    long allbufsize=0;
    long *workload;
    long zz=0;
#endif

    // + 1 keeps the request non-empty (and the string terminated) when locidone is 0
    allbuffer = (char *) mycalloc (bufsize + 1, sizeof (char));
#ifdef BIGMEMORY

    fflush(stdout);
    bufsize = 0;
    workload = (long *) mycalloc(2+world->loci*2, sizeof(long));//{locus1,offset1,locus2,offset2,.....,totalstrlen,n_loci}
    for (ww = 0; ww < locidone; ww++)
    {
        buffer = (char *) mycalloc (bufsize+1, sizeof (char));//some small value
        locus = world->who[ww];
        bufsize = (*pack) (&buffer, world, locus, maxrep, numpop);
        bufsize = (long) strlen(buffer);
        workload[zz++] = (long) locus;
        workload[zz++] = allbufsize; // offset of this locus in the combined text
        allbufsize += bufsize;
        allbuffer = (char *) myrealloc (allbuffer, sizeof (char)*allbufsize+1);
        strcat(allbuffer,buffer);
        free(buffer);
    }
    workload[2*world->loci]=strlen(allbuffer)+1;
    workload[2*world->loci+1]=(long)locidone;
    MYMPISEND (workload, 2+world->loci*2, MPI_LONG, MASTER, myID, comm_world);
    MYMPISEND (allbuffer, workload[2*world->loci], MPI_CHAR, MASTER, myID, comm_world);
    free(workload);
#else

    for (ww = 0; ww < locidone; ww++)
    {
        // shrink to an empty string; the packer enlarges the buffer as needed
        allbuffer = (char *) myrealloc (allbuffer, sizeof (char));
        allbuffer[0] = '\0';
        locus = world->who[ww];
        bufsize = (*pack) (&allbuffer, world, locus, maxrep, numpop);
        MYMPISEND (&bufsize, ONE, MPI_LONG, MASTER, locus + 1, comm_world);
        MYMPISEND (allbuffer, bufsize, MPI_CHAR, MASTER, locus + 1, comm_world);
    }

#endif /*BIGMEMORY*/

    // all sends have returned, the text is no longer needed
    free (allbuffer);
}

///
/// Broadcast the results of all loci from the master to every other node.
/// The master packs each locus with pack; all other ranks receive the text and
/// parse it with unpack. The number of replicates handed to both callbacks is
/// options->replicatenum when replicates are in use, otherwise 1.
/// Without BIGMEMORY each locus is broadcast separately (size, then text).
/// With BIGMEMORY the master concatenates all loci into one text and broadcasts
/// a workload table {locus1, offset1, ..., total size, number of loci} first.
/// \param world  world holding (master) or receiving (worker) the results
/// \param loci   number of loci to distribute
/// \param pack   function that packs one locus and returns its size including the NUL
/// \param unpack function that parses one packed locus
void
mpi_broadcast_results (world_fmt * world, long loci,
                       long (*pack) (char **buffer, world_fmt * world,
                                     long locus, long maxrep, long numpop),
                       void (*unpack) (char *buffer, world_fmt * world,
                                       long locus, long maxrep, long numpop))
{
    long locus;
    long bufsize=1;
    char nowstr[STRSIZE];
    char *allbuffer;
#ifdef BIGMEMORY

    long allbufsize=0;
    char *buffer;
    long *workload;
    long workerloci=0;
    long i;
    long offset=0;
    long zz=0;
#endif

    long maxreplicate = (world->options->replicate
                         && world->options->replicatenum >
                         0) ? world->options->replicatenum : 1;
    allbuffer = (char *) mycalloc (10, sizeof (char));
#ifdef BIGMEMORY

    workload = (long *) mycalloc( 2 * (loci) + 2, sizeof(long));
#endif

    get_time (nowstr, "%H:%M:%S");
    if(world->options->progress)
        printf("%i> Redistributing the data\nResult parts [Time is %s]\n",myID, nowstr);
#ifdef BIGMEMORY

    if (myID == MASTER)
    {
        for (locus = 0; locus < loci; locus++)
        {
            buffer = (char *) mycalloc (1, sizeof (char));
            bufsize = (*pack)(&buffer, world, locus, maxreplicate,
                              world->numpop);
            bufsize = (long) strlen(buffer);
            workload[zz++] = (long) locus;
            workload[zz++] = allbufsize; // offset of this locus in the combined text
            allbufsize += bufsize;
            allbuffer = (char *) myrealloc ( allbuffer, sizeof (char)*(allbufsize+1));
            strcat(allbuffer,buffer);
            free(buffer);
        }
        workload[2*loci]=allbufsize;
        workload[2*loci+1]=(long)loci;
        MYMPIBCAST (workload, 2 * (loci) + 2, MPI_LONG, MASTER, comm_world);
        MYMPIBCAST (allbuffer, allbufsize+1, MPI_CHAR, MASTER, comm_world);
    }
    else
    { // worker node
        MYMPIBCAST (workload, 2*(loci)+2, MPI_LONG, MASTER, comm_world);
        bufsize =  workload[2*loci] ;
        workerloci = workload[2*loci+1];
        allbuffer = (char *) myrealloc (allbuffer,
                                      sizeof (char) * (bufsize + 1));
        memset (allbuffer, 0, sizeof (char) * (bufsize + 1));
        MYMPIBCAST (allbuffer, bufsize + 1, MPI_CHAR, MASTER, comm_world);
        for(i=0; i<workerloci; i++)
        {
            locus = workload[2*i];
            offset = workload[2*i+1];
            (*unpack)(allbuffer + offset, world, locus, maxreplicate,
                      world->numpop);
        }
    }
    // the table is only needed during the exchange
    free(workload);
#else
    for (locus = 0; locus < loci; locus++)
    {
        if (myID == MASTER)
        {
            bufsize =(*pack) (&allbuffer, world, locus, maxreplicate,
                              world->numpop);
            MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
            MYMPIBCAST (allbuffer, bufsize, MPI_CHAR, MASTER, comm_world);
#ifdef DEBUG_MPI
            printf("%i> Locus %li results sent\n",myID, locus);
#endif
        }
        else
        {
            MYMPIBCAST (&bufsize, 1, MPI_LONG, MASTER, comm_world);
            allbuffer = (char *) myrealloc (allbuffer, sizeof (char) * bufsize + 1);
            MYMPIBCAST (allbuffer, bufsize, MPI_CHAR, MASTER, comm_world);
            (*unpack)(allbuffer, world, locus, maxreplicate,
                      world->numpop);
#ifdef DEBUG_MPI
            printf("%i> Locus %li results received\n",myID, locus);
#endif
        }
        // wipe the text before the next locus is packed or received
        memset (allbuffer, 0, sizeof (char) * bufsize);
    }
#endif
    free(allbuffer);
}

// slownet profiler
//
#ifdef SLOWNET

///
/// Master side of the profile likelihood calculation on slow networks.
/// The first min(nparam, numcpu - 1) parameter numbers are handed out with
/// non-blocking sends (parameter p goes to rank p + 1, tag p + 1). Then, for
/// every parameter, worker messages are received until a profile arrives:
///   'M' + size : a text message of that size follows and is passed to handle_message()
///   'P'        : four longs {parameter, table size, quantile size, failure size}
///                follow, then the text itself; the profile table is printed to
///                world->outfile and, when options->printprofsummary is set,
///                quantiles and failure flags are unpacked.
/// After a profile the sending worker gets the next unassigned parameter or,
/// when none is left, a message with tag 0 which stops it. Ranks that never
/// received work are stopped at the end.
/// \param world      master world; world->buffer is used as receive buffer
/// \param nparam     number of parameters to profile
/// \param profilewho filled with the rank that computed each parameter
void
mpi_profiles_master (world_fmt * world, long nparam, int *profilewho)
{
    long pnum;
    int sender=0;
    int tag;
    long pdone;
    boolean done;
    MPI_Status status;
    long numsent = 0;
    long bufsize;
    long quantsize;
    long failuresize;
    char *tempstr;
    long tempstralloc = MAXBUFSIZE;
    long tempstrsize;
    long temp[4];
    char **buffer = &world->buffer;
    FILE *outfile = world->outfile;
    long minnodes = MIN (nparam, numcpu - 1);
    MPI_Request *irequests;
    MPI_Status *istatus;
    long *firstjobs;
    irequests = (MPI_Request *) mycalloc(minnodes,sizeof(MPI_Request));
    istatus = (MPI_Status *) mycalloc(minnodes,sizeof(MPI_Status));
    // a non-blocking send must not have its buffer modified before it completes,
    // therefore every pending request gets its own storage instead of &pnum
    firstjobs = (long *) mycalloc(minnodes,sizeof(long));

    tempstr = (char*) mycalloc(MAXBUFSIZE,sizeof(char));

    for (pnum = 0; pnum < minnodes; pnum++)
    {
        firstjobs[pnum] = pnum;
        MYMPIISEND (&firstjobs[pnum], 1, MPI_LONG, (MYINT) (pnum + 1), (MYINT) (pnum + 1), comm_world,&irequests[numsent]);
        numsent++;
    }
    MYMPIWAITALL(minnodes,irequests, istatus);
    free(firstjobs);

    for (pnum = 0; pnum < nparam; pnum++)
    {
        done = FALSE;
        while(!done)
        {
            MYMPIRECV (tempstr, SMALLBUFSIZE, MPI_CHAR, (MYINT) MPI_ANY_SOURCE, (MYINT) MPI_ANY_TAG, comm_world, &status);
            sender = status.MPI_SOURCE;
            tag = status.MPI_TAG;
            switch(tempstr[0])
            {
            case 'M':
                tempstrsize = atol(tempstr+1);
                if (tempstrsize + 1 > tempstralloc)
                {
                    // announced message is larger than the receive buffer
                    tempstralloc = tempstrsize + 1;
                    tempstr = (char *) myrealloc (tempstr, sizeof (char) * tempstralloc);
                    tempstr[tempstrsize] = '\0';
                }
                MYMPIRECV (tempstr, tempstrsize, MPI_CHAR, (MYINT) sender, (MYINT) tag,
                           comm_world, &status);
                handle_message(tempstr,sender, world);
                break;
            case 'P':
                MYMPIRECV (temp, FOUR, MPI_LONG, (MYINT) sender, (MYINT) MPI_ANY_TAG,
                           comm_world, &status);
                pdone = temp[0];
                bufsize = temp[1];
                quantsize = temp[2];
                failuresize = temp[3];
                profilewho[pdone] = sender;
                *buffer =
                    (char *) myrealloc (*buffer, sizeof (char) * (bufsize + quantsize + failuresize + 1));
                memset (*buffer, 0, sizeof (char) * (bufsize + quantsize + failuresize + 1));
                MYMPIRECV (*buffer, bufsize + quantsize + failuresize, MPI_CHAR, (MYINT) sender, (MYINT) tag,
                          comm_world, &status);
                if(world->options->printprofsummary)
                {
                    // the quantiles follow the table, the failure flags follow the quantiles
                    unpack_quantile ((*buffer) + bufsize, world->quantiles[pdone],
                                     GRIDSIZE);
                    unpack_failed_percentiles ((*buffer) + bufsize + quantsize, world->percentile_failed[pdone],
                                     GRIDSIZE);
                    // cut the buffer back to the profile table
                    memset ((*buffer) + bufsize , 0, sizeof (char) * (quantsize + failuresize));
                }
                // print profile table that is in the first part of the buffer
                fprintf (outfile, "%s\n\n", *buffer);
                done=TRUE;
                break;
            default:
                fprintf(stderr,"%i> message=%s\n%i> sender=%i tag=%i\n",myID,tempstr, myID,status.MPI_SOURCE,status.MPI_TAG);
                MPI_Finalize();
                error("DIED because of wrong message from worker");
                break;
            }
        }
        if (numsent < nparam)
        {
            MYMPISEND (&numsent, ONE, MPI_LONG, (MYINT) sender, (MYINT) (numsent + 1), comm_world);
            numsent++;
        }
        else
        {
            // stop worker because there is nothing to do anymore
            MYMPISEND (&nparam, ONE, MPI_LONG, sender, 0, comm_world); //end of parameter list
        }
    }
    // stop workers that did nothing for profiles
    for (sender = MIN (nparam, numcpu - 1) + 1; sender < numcpu ; sender++)
    {
        // tag 0 ends the wait for profile work
        MYMPISEND (&nparam, ONE, MPI_LONG, sender, 0, comm_world); 
    }
    free(tempstr);
    free(istatus);
    free(irequests);
}

///
/// Worker side of the profile likelihood calculation on slow networks.
/// Parameter numbers are received from the master until a message with tag 0
/// arrives. For every parameter print_profile_likelihood_driver() fills
/// world->buffer with the profile table; when options->printprofsummary is
/// set the packed quantiles and failure flags are appended to it. The result
/// goes back in three messages tagged parameter + 1: a short notice starting
/// with 'P', four longs {parameter, table size, quantile size, failure size},
/// and the text itself. The parameter is recorded in world->profilewho.
/// \param world   worker world
/// \param gmaxptr forwarded to print_profile_likelihood_driver()
void
mpi_profiles_worker (world_fmt * world, long *gmaxptr)
{
    MPI_Status status;
    long sizes[4];
    long param;
    char *quantiletext = (char *) mycalloc (ONE, sizeof (char));
    char *failedtext = (char *) mycalloc (ONE, sizeof (char));
    char *notice = (char *) mycalloc (MAXBUFSIZE, sizeof (char));

    for (;;)
    {
        MYMPIRECV (&param, 1, MPI_LONG, (MYINT) MASTER, (MYINT) MPI_ANY_TAG, comm_world, &status);
        if (status.MPI_TAG == 0) // stop condition
            break;

        // fills world->buffer with profile information
        print_profile_likelihood_driver (param, world, gmaxptr);
        sizes[0] = param;
        sizes[1] = (long) strlen (world->buffer);
        sizes[2] = 0;
        sizes[3] = 0;
        if (world->options->printprofsummary)
        {
            sizes[2] = pack_quantile (&quantiletext, world->quantiles[param], GRIDSIZE);
            world->buffer = (char *) myrealloc (world->buffer,
                                                sizeof (char) * (sizes[1] + sizes[2] + 1));
            strcat (world->buffer, quantiletext);
            sizes[3] = pack_failed_percentiles (&failedtext, world->percentile_failed[param], GRIDSIZE);
            world->buffer = (char *) myrealloc (world->buffer,
                                                sizeof (char) * (sizes[1] + sizes[2] + sizes[3] + 1));
            strcat (world->buffer, failedtext);
        }

        // the leading 'P' tells the master that a profile follows
        sprintf (notice, "P%li", sizes[1]);
        MYMPISEND (notice, SMALLBUFSIZE , MPI_CHAR, (MYINT) MASTER, (MYINT) param + 1, comm_world);
        MYMPISEND (sizes, FOUR, MPI_LONG, (MYINT) MASTER, (MYINT) param + 1, comm_world);
        MYMPISEND (world->buffer, sizes[1] + sizes[2] + sizes[3], MPI_CHAR, (MYINT) MASTER, (MYINT) 
                  param + 1, comm_world);
        world->profilewho[profiledone++] = param;
    }

    free(notice);
    free (quantiletext);
    free(failedtext);
}

///
/// Pack one quantile record as text: a "QUANTILEBUFFER:" tag line, the name on
/// the next line (preceded by a blank), and n values printed with "%20.20f",
/// one per line. The buffer is overwritten and grows with every value.
/// \param buffer address of the heap buffer; reallocated here
/// \param quant  quantile record (name and param values)
/// \param n      number of values to pack
/// \return length of the packed string, NOT counting the terminating NUL
long
pack_quantile (char **buffer, quantile_fmt quant, long n)
{
    char number[LONGLINESIZE];
    long capacity = LINESIZE;
    long k = 0;

    *buffer = (char *) myrealloc (*buffer, sizeof (char) * capacity);
    sprintf (*buffer, "QUANTILEBUFFER:\n %s\n", quant.name);
    while (k < n)
    {
        // room for the printed value plus one spare character
        capacity += 1 + sprintf (number, "%20.20f\n", quant.param[k]);
        *buffer = (char *) myrealloc (*buffer, sizeof (char) * capacity);
        strcat (*buffer, number);
        k++;
    }
    return (long) strlen(*buffer);
}

///
/// Unpack one quantile record written by pack_quantile(). The first line (the
/// "QUANTILEBUFFER:" tag) is skipped, the second line is copied into
/// quant.name and the following n lines are converted with atof() into
/// quant.param.
/// \param buffer packed text
/// \param quant  record to fill; passed by value, written through its members
/// \param n      number of values to read
void
unpack_quantile (char *buffer, quantile_fmt quant, long n)
{
    char *cursor = buffer;
    char *line = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    long k;

    sgets (line, LONGLINESIZE, &cursor); // tag line, ignored
    sgets (line, LONGLINESIZE, &cursor); // name line
    strcpy (quant.name, line);

    k = 0;
    while (k < n)
    {
        sgets (line, LONGLINESIZE, &cursor);
        quant.param[k] = atof (line);
        k++;
    }
    free(line);
}

///
/// Pack the notices of failed convergence to the profile likelihood percentiles.
/// Each of the n flags becomes one character, '1' for failed and '0' otherwise;
/// the buffer is resized to exactly n + 1 characters and overwritten.
/// n is the number of grid points in the profile calculation (callers in this
/// file pass GRIDSIZE); changing that number has large ripple effects, see
/// profile_max_precentile().
/// \param buffer address of the heap buffer; reallocated here
/// \param failed array of n flags
/// \param n      number of flags
/// \return size of the packed text INCLUDING the terminating NUL (n + 1)
long
pack_failed_percentiles (char **buffer, boolean *failed, long n)
{
    const long nbytes = n + ONE;
    char *out;
    long k;

    *buffer = (char *) myrealloc (*buffer, sizeof (char) * nbytes);
    out = *buffer;
    for (k = 0; k < n; k++)
    {
        if (failed[k])
            out[k] = '1';
        else
            out[k] = '0';
    }
    out[n] = '\0';
    return nbytes;
}

///
/// Unpack the notices of failed convergence to the profile likelihood percentiles.
/// One line is read from buffer; flag i is set when character i of that line is '1'.
/// This assumes that n is never bigger than LONGLINESIZE.
/// \param buffer packed text as written by pack_failed_percentiles()
/// \param failed array of n flags to fill
/// \param n      number of flags
void
unpack_failed_percentiles (char *buffer, boolean *failed, long n)
{
    char *cursor = buffer;
    char *flags = (char*) mycalloc(LONGLINESIZE,sizeof(char));
    long k = 0;

    sgets (flags, LONGLINESIZE, &cursor);
    while (k < n)
    {
        failed[k] = (flags[k] == '1');
        ++k;
    }
    free(flags);
}

#endif

/*
// send the data over all loci/replicates to all nodes
// including the master node, so that all nodes can then 
// start calculating profiles [see calc_profiles()]
//
void distribute_locidata(world_fmt *world)
{
  char *buffer;
  pack_loci_data(world, &buffer);
  MPI_allgather(buffer);
  unpack_loci_data(buffer, world);
  free(buffer);
}
 
void pack_loci_data(world_fmt *world, char **buffer)
{
  long replicates = world->options->repl
  *buffer = myrealloc(*buffer,LONGLINESIZE);
  hits = sscanf (input, "%li %li %li %li %li", &world->loci, &world->numpop, &world->numpop2, &tmp, &replicates);  
}
*/
///
/// Worker loop used when old sumfiles are analyzed using MPI; the master side
/// reuses mpi_runloci_master().
/// Pairs of longs are received from the master until a message with tag 0
/// arrives. The first long is the locus: its time archive entry is exchanged
/// with slot locidone via swap_atl(), the locus number is sent back to the
/// master (tag locus + 1) and recorded in world->who.
/// \param world worker world
void
assignloci_worker (world_fmt * world)
{
    boolean done = FALSE;
    long locus;
    MPI_Status status;
    long * twolongs;
    twolongs = (long *) mycalloc(TWO,sizeof(long));
    while (!done)
    {
        MYMPIRECV (twolongs, TWO, MPI_LONG, (MYINT) MASTER, (MYINT) MPI_ANY_TAG,
                  comm_world, &status);
        locus = twolongs[0];
        if (status.MPI_TAG != 0) //stop condition
        {
            swap_atl (locus, locidone, world);

            MYMPISEND (&locus, ONE, MPI_LONG, (MYINT) MASTER, (MYINT) locus + 1, comm_world);
            /* we want to know what locus we worked for
               - to control the work sent by master
               - to use in setup_parameter0() [combroyden2.c] */
            world->who[locidone++] = locus;
        }
        else
        {
            done = TRUE;
        }
    }
    // the receive buffer is no longer needed once the master stopped us
    free(twolongs);
}

///
/// Exchange the time archive entries of two loci in every replicate.
/// The entries are swapped by value through a temporary copy, so after the
/// call slot to holds what was in from and slot from holds what was in to.
/// \param from  first locus index
/// \param to    second locus index
/// \param world world whose world->atl[replicate][locus] entries are exchanged
void
swap_atl (long from, long to, world_fmt * world)
{
    long r;
    timearchive_fmt tmp;
    if (from == to)
        return;
    for (r = 0; r < world->options->replicatenum; r++)
    {
        // keep a copy (not a pointer) of the entry that is overwritten first
        tmp = world->atl[r][to];
        world->atl[r][to] = world->atl[r][from];
        world->atl[r][from] = tmp;
    }
}


#ifdef SLOWNET
///
/// Prepare the apg0 tables on the worker nodes (slow network variant); the
/// master returns immediately.
/// For every replicate in [repstart, repstop) create_apg0() is called. Unless
/// repkind is SINGLECHAIN this is followed by interpolate_like() and by
/// create_multiapg0() for the same replicates. With multilocus set this is
/// done for all loci 0 .. loci-1, otherwise only for world->locus.
/// \param world      world holding the time archive
/// \param nr         optimizer structure holding apg0
/// \param repkind    replicate mode; SINGLECHAIN skips the multi-chain part
/// \param repstart   first replicate
/// \param repstop    one past the last replicate
/// \param loci       number of loci (multilocus case)
/// \param kind       unused
/// \param multilocus TRUE to process all loci
void
setup_parameter0_slowmpi (world_fmt * world, nr_fmt * nr, long repkind,
                          long repstart, long repstop, long loci, long kind,
                          boolean multilocus)
{
    const int combine = (repkind != SINGLECHAIN);
    long locus, rep;

    if (myID == MASTER)
        return;

    if (!multilocus)
    {
        // single locus
        for (rep = repstart; rep < repstop; rep++)
        {
            create_apg0 (nr->apg0[rep][world->locus], nr,
                         &world->atl[rep][world->locus], world->locus);
        }
        if (combine)
        {
            interpolate_like (nr, world->locus);
            for (rep = repstart; rep < repstop; rep++)
            {
                create_multiapg0 (nr->apg0[rep][world->locus], nr, rep,
                                  world->locus);
            }
        }
        return;
    }

    for (locus = 0; locus < loci; locus++)
    {
        for (rep = repstart; rep < repstop; rep++)
        {
            create_apg0 (nr->apg0[rep][locus], nr,
                         &world->atl[rep][locus], locus);
        }
        if (combine)
        {
            interpolate_like (nr, locus);
            for (rep = repstart; rep < repstop; rep++)
            {
                create_multiapg0 (nr->apg0[rep][locus], nr, rep, locus);
            }
        }
    }
}
#endif

/// Prepares the apg0 tables on a worker node.
///
/// Does nothing on the master. On every other node, for each replicate in
/// [repstart, repstop) create_apg0() is called for the locus in question;
/// when `repkind` is not SINGLECHAIN this is followed by interpolate_like()
/// and a create_multiapg0() pass over the same replicates.
/// With `multilocus` set, the loci handled are the first `locidone` entries
/// of world->who[] (world->locus is set to each of them in turn); otherwise
/// only world->locus is processed.
///
/// \param world       world structure (atl[], who[], locus)
/// \param nr          work structure holding apg0[replicate][locus]
/// \param repkind     replicate kind; SINGLECHAIN selects the short path
/// \param repstart    first replicate (inclusive)
/// \param repstop     last replicate (exclusive)
/// \param loci        unused here; the loop is bounded by `locidone`
/// \param kind        currently unused (only referenced by disabled code)
/// \param multilocus  process all loci recorded in world->who[]
void
setup_parameter0_mpi (world_fmt * world, nr_fmt * nr, long repkind,
                      long repstart, long repstop, long loci, long kind,
                      boolean multilocus)
{
    long locus;
    long rep;
    long idx;

    if (myID == MASTER)
        return;

    if (!multilocus)
    {
        /* single locus: everything refers to world->locus */
        for (rep = repstart; rep < repstop; rep++)
        {
            create_apg0 (nr->apg0[rep][world->locus], nr,
                         &world->atl[rep][world->locus], world->locus);
        }
        if (repkind != SINGLECHAIN)
        {
            interpolate_like (nr, world->locus);
            for (rep = repstart; rep < repstop; rep++)
            {
                create_multiapg0 (nr->apg0[rep][world->locus], nr, rep,
                                  world->locus);
            }
        }
        return;
    }

    for (idx = 0; idx < locidone; idx++)
    {
        /* make the locus this node worked on the current one */
        locus = world->who[idx];
        world->locus = locus;

        /* the create_apg0 pass is common to both replicate kinds */
        for (rep = repstart; rep < repstop; rep++)
        {
            create_apg0 (nr->apg0[rep][locus], nr,
                         &world->atl[rep][locus], locus);
        }
        if (repkind != SINGLECHAIN)
        {
            interpolate_like (nr, locus);
            for (rep = repstart; rep < repstop; rep++)
            {
                create_multiapg0 (nr->apg0[rep][locus], nr, rep, locus);
            }
        }
    }
}


/// Writes a message received from a worker to the file it is addressed to.
///
/// set_filehandle() decodes the prefix of `rawmessage`, selecting the
/// target stream and the offset at which the payload starts; the payload is
/// then printed and the stream flushed.
///
/// \param rawmessage  message text including its file-handle prefix
/// \param sender      rank of the sending node (not used here)
/// \param world       provides the FILE pointers the prefix can refer to
void handle_message(char *rawmessage,int sender, world_fmt * world)
{
    long  pos=0;
    FILE *file = stdout;
    (void) sender;
    set_filehandle(rawmessage, world, &file, &pos);
    /* set_filehandle() hands back whatever pointer is stored in world;
       presumably that can be NULL when the file was never opened on this
       node -- do not pass a NULL stream to fprintf, use stdout instead */
    if (file == NULL)
        file = stdout;
    fprintf(file,"%s", rawmessage + pos);
    fflush(file);
}

/// Appends `file` together with its numeric handle (as determined by
/// get_filehandle()) to the file database and prints the new entry to
/// stdout.
///
/// needs globals filedb, and filenum
/// NOTE(review): no bound check on filedb is done here; its capacity is not
/// visible from this function.
void setup_filehandle_db(FILE *file, world_fmt *world, option_fmt *options, data_fmt *data)
{
    long filehandle = get_filehandle(file, world, options, data);
    filedb[filenum].file = file;
    filedb[filenum].handle = filehandle;
    filenum++;
    /* %p requires a void pointer argument */
    fprintf(stdout,"filedb %li: %p %li\n",filenum, (void *) file,filehandle);
}

/// Looks up the numeric handle that was registered for `file` in the
/// global file database (filedb, filenum).
///
/// \param file  stream to look up
/// \return the registered handle, or 0 when `file` is not in the database
long retrieve_filehandle(FILE *file)
{
    long i;
    /* test the index before touching filedb[i]: the original compared
       filedb[i].file first and so read one element past the registered
       entries whenever `file` was not found */
    for (i = 0; i < filenum; i++)
    {
        if (filedb[i].file == file)
            return filedb[i].handle;
    }
    return 0;
}

/// Maps a FILE pointer to the numeric handle used in messages between
/// nodes.
///
/// The pointer is compared, in this order, against stdout, the log file,
/// the output file, the AIC file, the math file, the migration-history
/// file, the two Bayes files and the PDF output file; the first match
/// decides.
///
/// \param file     stream to classify
/// \param world    provides outfile, mathfile, mighistfile, bayesfile,
///                 bayesmdimfile and pdfoutfile
/// \param options  provides logfile and aicfile
/// \param data     not used
/// \return the matching *NUM constant, STDOUTNUM when nothing matches
long get_filehandle(FILE *file, world_fmt *world, option_fmt *options, data_fmt *data)
{
    long handle = STDOUTNUM;

    if (file != stdout)
    {
        if (file == options->logfile)
            handle = LOGFILENUM;
        else if (file == world->outfile)
            handle = OUTFILENUM;
        else if (file == options->aicfile)
            handle = AICFILENUM;
        else if (file == world->mathfile)
            handle = MATHFILENUM;
        else if (file == world->mighistfile)
            handle = MIGHISTFILENUM;
        else if (file == world->bayesfile)
            handle = BAYESFILENUM;
        else if (file == world->bayesmdimfile)
            handle = BAYESMDIMFILENUM;
        else if (file == world->pdfoutfile)
            handle = PDFOUTFILENUM;
    }
    return handle;
}

/// Variant of get_filehandle() that takes the option structure from
/// world->options.
///
/// Maps a FILE pointer to the numeric handle used in messages between
/// nodes; the first matching stream decides.
///
/// \param file   stream to classify
/// \param world  provides the FILE pointers to compare against
/// \return the matching *NUM constant, STDOUTNUM when nothing matches
long get_filehandle2(FILE *file, world_fmt *world)
{
    if(file == stdout)
        return STDOUTNUM;
    if(file == world->options->logfile)
        return LOGFILENUM;
    if(file == world->outfile)
        return OUTFILENUM;
    if(file == world->mighistfile)
        return MIGHISTFILENUM;
    if(file == world->options->aicfile)
        return AICFILENUM;
    if(file == world->mathfile)
        return MATHFILENUM;
    if(file == world->bayesfile)
        return BAYESFILENUM;
    if(file == world->bayesmdimfile)
        return BAYESMDIMFILENUM;
    /* get_filehandle() and set_filehandle() both know the PDF output file;
       it was missing here, so such output was mapped to stdout */
    if(file == world->pdfoutfile)
        return PDFOUTFILENUM;
 //   fprintf(stdout,"@@@@@@@@@@@@@@@wrong wrong wrong@@@@@@@@@@@@@@@@\n");
    return STDOUTNUM;
}


/// Decodes the file-handle prefix of a message and selects the stream the
/// message has to be written to.
///
/// The prefix is everything up to and including the first ':'; the number
/// that follows the first character of the message (mpi_fprintf() writes
/// "M<handle>:") selects the stream. Unknown numbers select stdout.
///
/// A message without any ':' is treated as plain text for stdout with the
/// payload starting at offset 0. The original code set the offset past the
/// terminating NUL in that case, copied the prefix into a 100 byte scratch
/// buffer without a bound and never released that buffer.
///
/// \param message   message text including the prefix
/// \param world     provides the FILE pointers that can be selected
/// \param file      [out] selected stream
/// \param msgstart  [out] offset of the payload inside `message`
void set_filehandle(char *message, world_fmt *world,
                    FILE **file, long *msgstart)
{
    size_t colon = strcspn(message,":");
    long handle = 0;

    *file = stdout;
    if (message[colon] != ':')
    {
        /* no prefix present: print the whole text */
        *msgstart = 0;
        return;
    }
    *msgstart = (long) colon + 1;
    /* skip the leading marker character; the conversion stops at the ':'.
       With the colon in first position there is no number to read. */
    if (colon > 0)
        handle = atol(message + 1);
    switch(handle)
      {
      case STDOUTNUM:
	*file = stdout;
	break;
      case LOGFILENUM:
	*file = world->options->logfile;
	break;
      case OUTFILENUM:
	*file = world->outfile;
	break;
      case AICFILENUM:
	*file = world->options->aicfile;
	break;
      case MATHFILENUM:
	*file = world->mathfile;
	break;
      case MIGHISTFILENUM:
	*file = world->mighistfile;
	break;
      case BAYESFILENUM:
	*file = world->bayesfile;
	break;
      case BAYESMDIMFILENUM:
	*file = world->bayesmdimfile;
	break;
      case PDFOUTFILENUM:
	*file = world->pdfoutfile;
	break;
      default:
	*file = stdout;
	break;
      }
}

/// fprintf() replacement that works on all nodes.
///
/// On the master the formatted text is written directly to `file`. On any
/// other node the text is prefixed with "M<handle>:" (handle as returned by
/// retrieve_filehandle()) and shipped to the master in two messages with
/// tag myID+PRINTTAG: first a SMALLBUFSIZE header "M<size>", then the text
/// itself including its terminating NUL.
///
/// The formatted text is limited to MAXBUFSIZE-1 characters (prefix
/// included); longer output is truncated instead of overrunning the buffer
/// as the unbounded vsprintf() of the original could.
///
/// \param file  target stream (master) or stream to look up (workers)
/// \param fmt   printf-style format string, followed by its arguments
void
mpi_fprintf(FILE *file, const char *fmt, ...)
{
    char *text;
    char *header;
    va_list ap;
    long filehandle = 0;
    long prefixlen = 0;
    long bufsize = 0;
    text = (char *) mycalloc(MAXBUFSIZE,sizeof(char));
    if(myID!=MASTER)
    {
        filehandle = retrieve_filehandle(file);
        prefixlen = snprintf(text, (size_t) MAXBUFSIZE, "%c%li:",'M',filehandle);
        if (prefixlen < 0)
            prefixlen = 0;
    }
    va_start(ap, fmt);
    vsnprintf(text + prefixlen, (size_t) MAXBUFSIZE - (size_t) prefixlen, fmt, ap);
    va_end(ap);
    if(myID!=MASTER)
    {
        /* the size header is only needed when the text is sent */
        header = (char *) mycalloc(SMALLBUFSIZE,sizeof(char));
        bufsize = 1 + (long) strlen(text);
        snprintf(header, (size_t) SMALLBUFSIZE, "M%li", bufsize);
        MYMPISEND (header, SMALLBUFSIZE, MPI_CHAR, (MYINT) MASTER, (MYINT) myID+PRINTTAG, comm_world);
        MYMPISEND (text, bufsize, MPI_CHAR, (MYINT) MASTER, (MYINT) myID+PRINTTAG, comm_world);
        free(header);
    }
    else
        fprintf(file,"%s", text);
    free(text);
}


///
/// assembles the data from a replicator
///
/// Receives from `sender` (on comm_workers, with the given tag) first the
/// size of the text buffer and then the buffer itself, and unpacks it into
/// `world`: with Bayesian inference through unpack_single_bayes_buffer(),
/// otherwise through unpack_single_sumfile_buffer(). When migration
/// histories are recorded, the part of the buffer after the first '@' is
/// handed to unpack_mighist_replicate_buffer().
///
/// \param sender     rank the data is received from
/// \param tag        message tag to receive
/// \param locus      locus the data belongs to
/// \param replicate  replicate the data belongs to
/// \param world      structure that receives the unpacked data
void 
mpi_receive_replicate(int sender, int tag, long locus, long replicate, world_fmt * world)
{
    char *buffer;
    char *mighistbuffer;
    char *input;
    long  bufsize=1;
    MPI_Status status;
    
    input = (char*) mycalloc(LONGLINESIZE*world->numpop2,sizeof(char));
    MYMPIRECV (&bufsize, ONE, MPI_LONG, (MYINT) sender, (MYINT) tag, comm_workers, &status);
    buffer = (char *) mycalloc (bufsize, sizeof (char));
    MYMPIRECV (buffer, bufsize, MPI_CHAR, (MYINT) sender, (MYINT) tag, comm_workers, &status);
    if(world->options->bayes_infer)
    {
        unpack_single_bayes_buffer(buffer,world->bayes,world,locus);
    }
    else
    {
        unpack_single_sumfile_buffer(input, buffer, world->atl, world, locus, replicate, world->numpop);
    }
    if(world->options->mighist)
    {
        /* the migration-history section follows the '@' separator; without
           a separator there is nothing to unpack (the original computed
           NULL + 1 and passed it on) */
        mighistbuffer = strchr(buffer,'@');
        if (mighistbuffer != NULL)
        {
            unpack_mighist_replicate_buffer(input, mighistbuffer + 1, world, locus, world->numpop);
        }
        else
        {
            fprintf(stderr,"%i> mpi_receive_replicate: no '@' section in buffer from node %i\n", myID, sender);
        }
    }
    free (buffer);
    free(input);
}

///
/// replicant sends data to sub-master
///
/// Packs the results for (locus, replicate) into a text buffer -- through
/// pack_single_bayes_buffer() with Bayesian inference, otherwise through
/// pack_single_sumfile_buffer() -- appends "@" plus the packed migration
/// history when histories are recorded, and sends first the buffer size
/// and then the buffer (including its terminating NUL) on comm_workers
/// with tag locus+1+REPTAG.
///
/// \param sender     rank the data is sent TO (the name is historical)
/// \param locus      locus the data belongs to
/// \param replicate  replicate the data belongs to
/// \param world      source of the data to pack
void 
mpi_send_replicate(int sender, long locus,  long replicate, world_fmt * world)
{
    long allocbufsize;
    long bufsize;
    char *mighistbuffer;
    long numpop = world->numpop;
    char *buffer;
    timearchive_fmt **ta = world->atl;

    if(world->options->bayes_infer)
    {
        allocbufsize = world->bayes->numparams * (world->numpop2+1);
        buffer = (char *) mycalloc (allocbufsize, sizeof (char));    
        bufsize = pack_single_bayes_buffer(&buffer,world->bayes,world,locus);
    }
    else
    {
        allocbufsize = 100 + ta[replicate][locus].T * (22+ 3 * numpop + numpop * (numpop - 1) * 22 + 50 * numpop * numpop + 50);
        buffer = (char *) mycalloc (allocbufsize, sizeof (char));    
        bufsize = pack_single_sumfile_buffer(&buffer, world, locus, replicate, world->numpop);
    }
    if(world->options->mighist)
    {
        mighistbuffer = (char *) mycalloc (ONE, sizeof (char));
        pack_mighist_buffer(&mighistbuffer, world, locus, -1, numpop);
        /* room for both strings, the '@' separator and the final NUL;
           sized from the strings themselves, which is exactly what the
           two strcat() calls below require */
        bufsize = (long) strlen(buffer) + (long) strlen(mighistbuffer) + 2;
        buffer = (char *) myrealloc (buffer, bufsize * sizeof (char));
        strcat(buffer,"@");
        strcat(buffer,mighistbuffer);
        /* the packed history has been copied; it was leaked before */
        free(mighistbuffer);
    }
    bufsize = (long) strlen(buffer)+1;
    MYMPISEND (&bufsize, ONE, MPI_LONG, (MYINT) sender, (MYINT) (locus+1+ REPTAG), comm_workers);
    MYMPISEND (buffer, bufsize, MPI_CHAR, (MYINT) sender, (MYINT) (locus+1 + REPTAG), comm_workers);
    
    free(buffer);
}


#if 0
///
/// mpi-heating first version
///
/// \brief updates all trees, controls updates and heating
///
/// controls updates and temperatures (threaded or unthreaded)
/// updates the tree and/or parameters
///
/// This function sits inside an "#if 0" region and is therefore not
/// compiled.
///
/// With heating switched on, one non-blocking message per heated chain is
/// posted, the cold chain is updated, all sends are waited for, chains are
/// swapped and (with adaptive heating) the temperatures are adjusted.
/// Without heating only the cold chain is updated.
///
/// NOTE(review): `receiver` is used as index into irequests[] but is never
/// assigned; the loop variable `ii` looks like the intended index -- confirm
/// before re-enabling this code.
/// NOTE(review): irequests, istatus and twolongs are allocated but not
/// released in this function; first_free_node/free_nodes are computed but
/// not used.
/// \callgraph
void mpi_run_updates(world_fmt ** universe,
                 int usize,
                 option_fmt * options,
                 long inc,
                 long increment,
                 long step,
                 long steps)
{
    long ii;
    long receiver;
    world_fmt * world = universe[0];
    long  * twolongs;
    
    // number of replicates, at least 1
    long maxreplicate = (world->options->replicate
                         && world->options->replicatenum >
                         0) ? world->options->replicatenum : 1;    
    long first_free_node = universe[0]->loci * maxreplicate + 1;
    long free_nodes = numcpu - first_free_node;
    MPI_Request *irequests;
    MPI_Status *istatus;
    // one request/status slot per heated chain
    irequests = (MPI_Request *) mycalloc(options->heated_chains,sizeof(MPI_Request));
    istatus = (MPI_Status *) mycalloc(options->heated_chains,sizeof(MPI_Status));
    twolongs = (long *) calloc(TWO,sizeof(long));
    
    if (options->heating)
    {
        for (ii = 0; ii < options->heated_chains; ii++)
        {
                // NOTE(review): request slot is indexed with the unassigned `receiver`
                MYMPIISEND (twolongs, TWO, MPI_LONG, ii, (MYINT) universe[0]->locus + 1 + TEMPTAG, comm_world, &irequests[receiver]);
        }
        run_one_update(EARTH);
        MYMPIWAITALL(options->heated_chains, irequests,istatus);
        
        heated_swap(universe, EARTH->options);
        if(options->adaptiveheat)
            adjust_temperatures(universe, options->heated_chains, inc/*rement*/+step*increment, steps*increment);        
    }
    else /* no heating*/
    {
        run_one_update(EARTH);
    }
#ifdef UEP
    if (options->uep && EARTH->in_last_chain)
        update_uepanc(EARTH);
#endif /*UEP*/
}
#endif /* 0 this stuff is not used*/

#endif /* MPI */
