/* MPI parts for migrate 
   started November 2000, Seattle
   Peter Beerli beerli@genetics.washington.edu

   
Copyright 2001 Peter Beerli and Joseph Felsenstein

$Id: migrate_mpi.c,v 1.11 2001/09/07 23:56:13 beerli Exp $
*/
#ifdef MPI
#include "migrate_mpi.h"
#include "broyden.h"
#include "combroyden.h"
#include "gammalike.h"
#include "profile.h"
#include "options.h"
#include "world.h"
#include "joint-chains.h"
/*should go into profile.h*/
#define GRIDSIZE 9

extern void run_locus (world_fmt ** universe, int usize,
		       option_fmt * options, data_fmt * data,
		       tpool_t * heating_pool, long maxreplicate,
		       long locus, long *treefilepos, long *Gmax);

long pack_databuffer (char **buffer, data_fmt * data, option_fmt * options);
void mpi_gradient_master (nr_fmt * nr, world_fmt * world, int *who);
void mpi_results_master (double sendtype, world_fmt * world,
			 long maxrep,
			 void (*unpack) (char *buffer, world_fmt * world,
					 long locus, long maxrep,
					 long numpop));

void mpi_results_worker (long bufs, world_fmt * world,
			 long maxrep,
			 long (*pack) (char **buffer, world_fmt * world,
				       long locus, long maxrep, long numpop));
void assignloci_worker (world_fmt * world);
void swap_atl (long from, long to, world_fmt * world);
long pack_quantile(char **buffer, quantile_fmt quant, long n);
void unpack_quantile(char *buffer, quantile_fmt quant, long n);

void
mpi_runloci_master (long loci, int *who)
{
  long locus;
  int sender;
  long locusdone;
  MPI_Status status;
  long numsent = 0;
  for (locus = 0; locus < MIN (loci, numcpu - 1); locus++)
    {
      MPI_Send (&locus, 1, MPI_LONG, locus + 1, locus + 1, comm_world);
      numsent++;
    }
  for (locus = 0; locus < loci; locus++)
    {
      MPI_Recv (&locusdone, 1, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
		comm_world, &status);
      sender = status.MPI_SOURCE;
      who[locusdone] = sender;
      if (numsent < loci)
	{
	  MPI_Send (&numsent, 1, MPI_LONG, sender, numsent + 1,
		    comm_world);
	  numsent++;
	}
      else
	{
	  MPI_Send (&loci, 1, MPI_LONG, sender, 0, comm_world);	//stop worker do wait for new loci
	}
    }
    for(sender=MIN (loci, numcpu - 1)+1; sender < numcpu; sender++)
	{
		MPI_Send (&loci, 1, MPI_LONG, sender, 0, comm_world);	//stop worker do wait for new loci
	}
}


void
mpi_runloci_worker (world_fmt ** universe, int usize,
		    option_fmt * options, data_fmt * data,
		    tpool_t * heating_pool, long maxreplicate,
		    long *treefilepos, long *Gmax)
{
  boolean done = FALSE;
  long locus;
  MPI_Status status;
  while (!done)
    {
      MPI_Recv (&locus, 1, MPI_LONG, MASTER, MPI_ANY_TAG,
		comm_world, &status);
      if (status.MPI_TAG != 0)	//stop condition
	{
	  run_locus (universe, usize, options, data,
		     heating_pool, maxreplicate, locus, treefilepos, Gmax);
	  MPI_Send (&locus, 1, MPI_LONG, MASTER, locus + 1, comm_world);
	  /* we want to know what locus we worked for
	     - to control the work sent by master
	     - to use in setup_parameter0() [combroyden2.c] */
	  universe[0]->who[locidone++] = locus;
	}
      else
	{
	  done = TRUE;
	}
    }
}

double
mpi_likelihood_master (double *param, double *lparam,
		       world_fmt * world, nr_fmt * nr,
		       helper_fmt * helper, int *who)
{
  long locus, worker;
  long sender;
  MPI_Status status;
  double logres = 0.0;
  double *temp;
  double *tmp;
  int tag;

  long numelem = world->numpop2 + (world->options->gamma ? 2 : 1);
  long numelem2 = numelem * 2;
  tmp = (double *) calloc (world->loci, sizeof (double));
  temp = (double *) calloc (numelem2, sizeof (double));
  temp[0] = MIGMPI_LIKE;
  memcpy (temp + 1, param, (numelem - 1) * sizeof (double));
  memcpy (temp + numelem, lparam, (numelem - 1) * sizeof (double));
  memset (nr->locilikes, 0, sizeof (double) * world->loci);
  for (worker = 1; worker < MIN(world->loci+1,numcpu); worker++)
    {
      MPI_Send (temp, (int) numelem2, MPI_DOUBLE, worker,
		worker, comm_world);
    }
  for (worker = 1; worker < MIN(world->loci+1,numcpu); worker++)
    {
      MPI_Recv (tmp, (int) world->loci, MPI_DOUBLE, MPI_ANY_SOURCE,
		MPI_ANY_TAG, comm_world, &status);
      sender = status.MPI_SOURCE;
      tag = status.MPI_TAG;
      // the worker send a vector of values 
      // e.g. (0,0,0,-12,0,-34,0,0,0), these are all loci
      // of which most of them were not evaluated
      // t he loop updates the master copy of locilikes
      for (locus = 0; locus < world->loci; locus++)
	{
	  nr->locilikes[locus] += tmp[locus];
	}
    }
  for (locus = 0; locus < world->loci; locus++)
    {
      logres += nr->locilikes[locus];
    }
  nr->llike = logres;
  free (temp);
  return logres;
}


void
mpi_likelihood_worker (world_fmt * world, helper_fmt * helper, long rep)
{
  long locus, ww;
  nr_fmt *nr = helper->nr;
  memset (nr->locilikes, 0, sizeof (double) * world->loci);
  for (ww = 0; ww < locidone; ww++)
    {
      locus = nr->world->who[ww];
      if (!world->options->gamma)
	{
	  nr->locilikes[locus] = calc_locus_like (nr, nr->param, nr->lparam, locus);
	}
      else
	{
	  helper->locus = locus;
	  helper->nr->locilikes[locus] = gamma_locus_like (nr,
							   helper->expxv,
							   helper->xv, helper->weight, locus);
	}
    }
}


void
mpi_gmax_master (world_fmt * world, long *Gmax)
{
  long sender;
  MPI_Status status;
  long tmp;
  int tag;
  int numreceived = 0;
  *Gmax = 0.;
  MPI_Bcast (Gmax, 1, MPI_LONG, MASTER, comm_world);
  while (numreceived < numcpu - 1)
    {
      MPI_Recv (&tmp, 1, MPI_LONG, MPI_ANY_SOURCE,
		MPI_ANY_TAG, comm_world, &status);
      sender = status.MPI_SOURCE;
      tag = status.MPI_TAG;
      if (*Gmax < tmp)
	*Gmax = tmp;
      numreceived++;
    }
  MPI_Bcast (Gmax, 1, MPI_LONG, MASTER, comm_world);
}

void
mpi_gmax_worker (world_fmt * world)
{
  long ww;
  long repstart;
  long repstop;
  long r;
  long locus;
  long Gmax = 1;
  MPI_Bcast (&Gmax, 1, MPI_LONG, MASTER, comm_world);
  set_replicates (world, world->repkind, world->options->replicatenum,
		  &repstart, &repstop);

  for (ww = 0; ww < locidone; ww++)
    {
      locus = world->who[ww];
      for (r = repstart; r < repstop; r++)
	{
	  if (Gmax < world->atl[r][locus].T)
	    Gmax = world->atl[r][locus].T;
	}
    }
  MPI_Send (&Gmax, 1, MPI_LONG, MASTER, myID, comm_world);
}


void
mpi_send_stop (world_fmt * world)
{
  long worker;
  double *temp;
  long numelem = world->numpop2 + (world->options->gamma ? 2 : 1);
  long numelem2 = 2 * numelem;
  temp = (double *) calloc (numelem2, sizeof (double));
  temp[0] = MIGMPI_END;
  for (worker = 1; worker < numcpu; worker++)
    {
      MPI_Send (temp, (int) numelem2, MPI_DOUBLE, worker, 0, comm_world);	//end of loci
    }
  free (temp);
}

void
mpi_send_stop_tag (int worker, world_fmt * world)
{
  double *temp;
  long numelem = world->numpop2 + (world->options->gamma ? 2 : 1);
  long numelem2 = 2 * numelem;
  temp = (double *) calloc (numelem2, sizeof (double));
  temp[0] = MIGMPI_END;
  MPI_Send (temp, (int) numelem2, MPI_DOUBLE, worker, 0, comm_world);	//end of loci
  free (temp);
}

void
mpi_results_stop (void)
{
  long worker;
  long dummy = 0;
  for (worker = 1; worker < numcpu; worker++)
    {
      MPI_Send (&dummy, 1, MPI_LONG, worker, 0, comm_world);
    }
}

void
mpi_gradient_master (nr_fmt * nr, world_fmt * world, int *who)
{
  long locus;
  long sender;
  MPI_Status status;
  int tag;

  double *temp;
  long *tempindex;
  long numelem = world->numpop2 + (world->options->gamma ? 2 : 1);
  long numelem2 = 2 * numelem;
  temp = (double *) calloc (numelem2, sizeof (double));
  tempindex = (long *) calloc (numelem, sizeof (long));
  temp[0] = MIGMPI_GRADIENT;
  memcpy (temp + 1, nr->param, (numelem - 1) * sizeof (double));
  memcpy (tempindex, nr->indeks, (numelem - 1) * sizeof (long));
  memcpy (temp + numelem, nr->lparam, (numelem - 1) * sizeof (double));
  temp[numelem2 - 1] = nr->profilenum;
  for (locus = 1; locus < MIN(world->loci+1,numcpu); locus++)
    {
      MPI_Send (temp, (int) numelem2, MPI_DOUBLE, locus,
		locus, comm_world);
      MPI_Send (tempindex, (int) numelem, MPI_LONG, locus,
		locus, comm_world);
    }
  memset (nr->d, 0, sizeof (double) * (numelem - 1));
  for (locus = 1; locus <  MIN(world->loci+1,numcpu); locus++)
    {
      copy_and_clear_d (nr);
      MPI_Recv (nr->d, (int) (numelem - 1), MPI_DOUBLE, MPI_ANY_SOURCE,
		MPI_ANY_TAG, comm_world, &status);
      add_back_d (nr);
      sender = status.MPI_SOURCE;
      tag = status.MPI_TAG;
    }
  free (temp);
}

void
mpi_gradient_worker (helper_fmt * helper, nr_fmt * nr,
		     timearchive_fmt ** tyme)
{
  long ww, locus;
  memset (nr->d, 0, sizeof (double) * nr->partsize);
  for (ww = 0; ww < locidone; ww++)
    {
      locus = nr->world->who[ww];
      copy_and_clear_d (nr);
      simple_loci_derivatives (nr->d, nr, tyme, locus);
      add_back_d (nr);
    }
}

void
mpi_maximize_worker (world_fmt * world, long rep)
{
  boolean done = FALSE;
  long locus;
  MPI_Status status;
  nr_fmt *nr;
  helper_fmt helper;
  long repstart, repstop, Gmax;
  long numelem = world->numpop2 + (world->options->gamma ? 2 : 1);
  long numelem2 = numelem * 2;
  double *temp;
  temp = (double *) calloc (numelem2, sizeof (double));
  nr = (nr_fmt *) calloc (1, sizeof (nr_fmt));
  set_replicates (world, world->repkind, rep, &repstart, &repstop);

  which_calc_like (world->repkind);
  MPI_Bcast (&Gmax, 1, MPI_LONG, MASTER, comm_world);
  create_nr (nr, world, Gmax, 0, world->loci, world->repkind, repstart);
  SETUPPARAM0 (world, nr, world->repkind,
		    repstart, repstop, world->loci, MULTILOCUS, TRUE);
  while (!done)
    {
      MPI_Recv (temp, (int) numelem2, MPI_DOUBLE, MASTER, MPI_ANY_TAG,
		comm_world, &status);
      locus = world->locus = status.MPI_TAG - 1;
      switch ((long) temp[0])
	{
	case MIGMPI_LIKE:
	  memset (nr->locilikes, 0, sizeof (double) * world->loci);
	  memcpy (nr->param, temp + 1, sizeof (double) * (numelem - 1));
	  memcpy (nr->lparam, temp + numelem,
		  sizeof (double) * (numelem - 1));
	  fill_helper (&helper, nr->param, nr->lparam, world, nr);
	  mpi_likelihood_worker (world, &helper, rep);
	  MPI_Send (nr->locilikes, (int) world->loci, MPI_DOUBLE, MASTER,
		    locus + 1, comm_world);
	  break;
	case MIGMPI_GRADIENT:
	  memcpy (nr->param, temp + 1, sizeof (double) * (numelem - 1));
	  memcpy (nr->lparam, temp + numelem,
		  sizeof (double) * (numelem - 1));
	  fill_helper (&helper, nr->param, nr->lparam, world, nr);
	  nr->profilenum = temp[numelem2 - 1];
	  MPI_Recv (nr->indeks, (int) numelem, MPI_LONG, MASTER, MPI_ANY_TAG,
		    comm_world, &status);
	  mpi_gradient_worker (&helper, nr, world->atl);
	  MPI_Send (nr->d, (int) nr->partsize, MPI_DOUBLE, MASTER, locus + 1,
		    comm_world);
	  break;
	case MIGMPI_RESULT:
	  mpi_results_worker (temp[1], world, repstop, pack_result_buffer);
	  break;
	case MIGMPI_SUMFILE:
	  mpi_results_worker (temp[1], world, repstop, pack_sumfile_buffer);
	  break;
	case MIGMPI_MIGHIST:
	  mpi_results_worker (temp[1], world, repstop, pack_mighist_buffer);
	  break;
	case MIGMPI_END:
	  done = TRUE;
	  break;
	default:
	  fprintf (stdout, "%i> does not understand task\n", myID);
	  exit (0);
	}
    }
  destroy_nr (nr, world);
}

void
broadcast_options_master (option_fmt * options)
{
  int bufsize = MAXBUFSIZE;
  char *buffer;
  buffer = (char *) calloc (1, sizeof (char));
  bufsize = save_options_buffer (&buffer, options);
  MPI_Bcast (&bufsize, 1, MPI_LONG, MASTER, comm_world);
  MPI_Bcast (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
  free (buffer);
}


void
broadcast_options_worker (option_fmt * options)
{
  long bufsize;
  char *buffer;
  MPI_Bcast (&bufsize,1, MPI_LONG, MASTER, comm_world);
  buffer = (char *) calloc (bufsize, sizeof (char));    
  MPI_Bcast (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
  read_options_worker (&buffer, options);
  free (buffer);
}


void
broadcast_data_master (data_fmt * data, option_fmt * options)
{
  long bufsize;
  char *buffer;
  buffer = (char *) calloc (1, sizeof (char));
  bufsize = pack_databuffer (&buffer, data, options);
  MPI_Bcast (&bufsize, 1, MPI_LONG, MASTER, comm_world);
  MPI_Bcast (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
  free (buffer);
}

void
broadcast_data_worker (data_fmt * data, option_fmt * options)
{
  long bufsize;
  char *buffer;
  MPI_Bcast (&bufsize, 1, MPI_LONG, MASTER, comm_world);
  buffer = (char *) calloc (bufsize, sizeof (char));
  MPI_Bcast (buffer, bufsize, MPI_CHAR, MASTER, comm_world);
  free (buffer);
}

long
pack_databuffer (char **buffer, data_fmt * data, option_fmt * options)
{
  long locus;
  long bufsize = 0;
  char fp[LINESIZE];
  bufsize += LINESIZE;
  *buffer = (char *) realloc(*buffer, sizeof(char) * bufsize);
  sprintf (fp, "%c%li%li%c%s\n", options->datatype, data->numpop,
	   data->loci, data->dlm, options->title);
  strcat(*buffer, fp);
  for (locus = 0; locus < data->loci; locus++)
    {
      sprintf (fp, "%li ", data->seq->sites[locus]);
      bufsize += strlen (fp)+1;
     *buffer = (char *) realloc(*buffer, sizeof(char) * bufsize);
      strcat(*buffer, fp);
    }
  sprintf (fp, "\n%li %f", data->seq->addon, data->seq->fracchange);
        bufsize += strlen (fp)+1;
     *buffer = (char *) realloc(*buffer, sizeof(char) * bufsize);
      strcat(*buffer, fp);
  return bufsize;
}

void
unpack_result_buffer (char *buffer, world_fmt * world,
		      long locus, long maxrep, long numpop)
{
  long rep, pop;
  timearchive_fmt **atl = world->atl;
  double ***apg0 = world->apg0;
  char input[LINESIZE];
  char *buf = buffer;
  strcpy (input, "");
  for (rep = 0; rep < maxrep; rep++)
    {
      sgets (input, LINESIZE, &buf);
      atl[rep][locus].param_like = atof (input);
      for (pop = 0; pop < 4 * numpop * numpop; pop++)
	{
	  sgets (input, LINESIZE, &buf);
	  atl[rep][locus].parameters[pop] = atof (input);
	}
    }
// apg0
  for (rep = 0; rep < maxrep; rep++)
    {
      for (pop = 0; pop < world->options->lsteps; pop++)
	{
	  sgets (input, LINESIZE, &buf);
	  apg0[rep][locus][pop] = atof (input);
	}
    }

}

long
pack_result_buffer (char **buffer, world_fmt * world,
		    long locus, long maxrep, long numpop)
{
  long rep, pop;
  char input[LINESIZE];
  timearchive_fmt **atl = world->atl;
  double ***apg0 = world->apg0;
   long bufsize = 1;//maxrep * (1 + 4*numpop*numpop + world->options->lsteps) * 42 ;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
  memset (*buffer, 0, sizeof (char) * bufsize);
  for (rep = 0; rep < maxrep; rep++)
    {
      sprintf (input, "%20.20f\n", atl[rep][locus].param_like);
	bufsize += strlen(input) + 1;
	(*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
      strcat (*buffer, input);
      for (pop = 0; pop < 4 * numpop * numpop; pop++)
	{
	  sprintf (input, "%20.20f\n", atl[rep][locus].parameters[pop]);
	bufsize += strlen(input) + 1;
	(*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
	  strcat (*buffer, input);
	}
    }
// apg0
  for (rep = 0; rep < maxrep; rep++)
    {
      for (pop = 0; pop < world->options->lsteps; pop++)
	{
	  sprintf (input, "%20.20f\n", apg0[rep][locus][pop]);
	bufsize += strlen(input) + 1;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
	  strcat (*buffer, input);
	}
    }
  return bufsize;
}

void
unpack_mighist_buffer (char *buffer, world_fmt * world,
		       long locus, long maxrep, long numpop)
{
  long i, j;
  mighistloci_fmt *aa;
  char input[LINESIZE];
  char *buf = buffer;
  aa = &world->mighistloci[locus];
  sgets (input, LINESIZE, &buf);
  sscanf (input, "%li\n", &aa->mighistnum);
  aa->mighist = (mighist_fmt *) realloc (aa->mighist,
					 sizeof (mighist_fmt) *
					 aa->mighistnum);
  for (j = 0; j < aa->mighistnum; j++)
    {
      sgets (input, LINESIZE, &buf);
      sscanf (input, "%li %li %li\n", &aa->mighist[j].copies,
	      &aa->mighist[j].weight, &aa->mighist[j].migeventsize);
      aa->mighist[j].migevents =
	(migevent_fmt *) realloc (aa->mighist[j].migevents,
				  sizeof (migevent_fmt) *
				  aa->mighist[j].migeventsize + 1);
      for (i = 0; i < aa->mighist[j].migeventsize; i++)
	{
	  sgets (input, LINESIZE, &buf);
	  sscanf (input, "%lf %lf %lf\n",
		  &aa->mighist[j].migevents[i][0],
		  &aa->mighist[j].migevents[i][1],
		  &aa->mighist[j].migevents[i][2]);
	}
    }
}


long
pack_mighist_buffer (char **buffer, world_fmt * world,
		     long locus, long maxrep, long numpop)
{
  long j, i;
  mighistloci_fmt *aa;
  char input[LINESIZE];
  long bufsize=1;
    aa = &world->mighistloci[locus];
  strcpy (input, "");
  for (j = 0; j < aa->mighistnum; j++)
    bufsize += 100 + aa->mighist[j].migeventsize * 100;
  (*buffer) = (char *) realloc ((*buffer), sizeof (char) * bufsize);
  memset (*buffer, 0, sizeof (char) * bufsize);
  sprintf (input, "%li\n", aa->mighistnum);
  strcat ((*buffer), input);

  for (j = 0; j < aa->mighistnum; j++)
    {
      sprintf (input, "%li %li %li\n", aa->mighist[j].copies,
	       aa->mighist[j].weight, aa->mighist[j].migeventsize);
      strcat ((*buffer), input);
      for (i = 0; i < aa->mighist[j].migeventsize; i++)
	{
	  sprintf (input, "%f %f %f\n",
		   aa->mighist[j].migevents[i][0],
		   aa->mighist[j].migevents[i][1],
		   aa->mighist[j].migevents[i][2]);
	  strcat ((*buffer), input);
	}
    }
  return strlen (*buffer) + 1;
}

void
unpack_sumfile_buffer (char *buffer, world_fmt * world,
		       long locus, long maxrep, long numpop)
{
  long r, i, j;
  long l = locus;
  char input[LINESIZE * world->numpop2];
  timearchive_fmt **ta = world->atl;
  char *buf = buffer;
  strcpy (input, "");
  for (r = 0; r < maxrep; r++)
    {
      sgets (input, LINESIZE, &buf);
      sscanf (input, "%li %li %li %lf\n", &ta[r][l].T, &ta[r][l].numpop,
	      &ta[r][l].sumtips, &ta[r][locus].param_like);
      increase_timearchive (world, l, ta[r][l].T, world->numpop, r);
      for (i = 0; i < ta[r][l].T; i++)
	{
	  sgets (input, LINESIZE, &buf);
	  sscanf (input, "%li ", &ta[r][l].tl[i].copies);
	  ta[r][l].tl[i].lcopies = ln_copies(ta[r][l].tl[i].copies);
	  for (j = 0; j < 3*numpop + numpop * (numpop-1); j++)
	  {
	      sgets (input, LINESIZE, &buf);
	      sscanf (input, "%lf\n", &ta[r][l].tl[i].data[j]);
	    }
	}
      for (i = 0; i < world->numpop2; i++)
	{
	  sgets (input, LINESIZE, &buf);
	  sscanf (input, "%lg %lg\n", &ta[r][l].param[i],
		  &ta[r][l].param0[i]);
	}
      log_param0 (ta[r][l].param0, ta[r][l].lparam0, world->numpop2);
      sgets (input, LINESIZE, &buf);
      sscanf (input, "%li %lg\n", &ta[r][l].trials, &ta[r][l].normd);
    }
}


long
pack_sumfile_buffer (char **buffer, world_fmt * world,
		     long locus, long maxrep, long numpop)
{
  long r, i, j;
  long numpop2 = numpop * numpop;
  long l = locus;
  char input[LINESIZE];
  timearchive_fmt **ta = world->atl;
  // calc bufsize needed
  long bufsize = 1;
  (*buffer) = (char *) realloc ((*buffer), sizeof (char) * bufsize);
  memset (*buffer, 0, sizeof (char) * bufsize);
  for (r = 0; r < maxrep; r++)
    {
      sprintf (input, "%20li %20li %20li %20.20f \n", ta[r][l].T, ta[r][l].numpop,
	       ta[r][l].sumtips, ta[r][l].param_like);
	bufsize += strlen(input) + 1;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
      strcat ((*buffer), input);
      for (i = 0; i < ta[r][l].T; i++)
	{
	  sprintf (input, "%20li\n", ta[r][l].tl[i].copies);
	bufsize += strlen(input) + 1;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
	  strcat ((*buffer), input);
	  for (j = 0; j < 3*numpop + numpop * (numpop-1); j++)
	    {
	      sprintf (input, "%20.20f\n",
		       ta[r][l].tl[i].data[j]);
	bufsize += strlen(input) + 1;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
	      strcat ((*buffer), input);
	    }
	}
      for (i = 0; i < numpop2; i++)
	{
	  sprintf (input, "%20.20e %20.20e\n", ta[r][l].param[i],
		   ta[r][l].param0[i]);
	bufsize += strlen(input) + 1;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
	  strcat ((*buffer), input);
	}
      sprintf (input, "%20li %20.20e\n", ta[r][l].trials, ta[r][l].normd);
	bufsize += strlen(input) + 1;
   (*buffer) = (char *) realloc (*buffer, sizeof (char) * bufsize);
      strcat ((*buffer), input);
    }
  return bufsize;
}

void
mpi_results_master (double sendtype, world_fmt * world, long maxreplicate,
		    void (*unpack) (char *buffer, world_fmt * world,
				    long locus, long maxrep, long numpop))
{
  long numpop = world->numpop;
  int bufsize = 1;
  boolean done = FALSE;
  char *buffer, *sbuffer;
  double *temp;
  int locus;
  long z, tag, sender;
  MPI_Status status;
  long numelem = world->numpop2 + (world->options->gamma ? 2 : 1);
  long numelem2 = 2 * numelem;
  temp = (double *) calloc (numelem2, sizeof (double));
  buffer = (char *) calloc (bufsize, sizeof (char));
  temp[0] = sendtype;
  temp[1] = bufsize;
  for (locus = 1; locus <  MIN(world->loci+1,numcpu); locus++)
    {
      MPI_Send (temp, numelem2, MPI_DOUBLE, locus, locus, comm_world);
    }
  z = 0;
  while (!done)
    {
      if (z++ >= world->loci)
	{
	  break;
	}
      MPI_Recv (&bufsize, 1, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
		comm_world, &status);
      buffer = (char *) realloc (buffer, sizeof (char) * (bufsize + 1));
      memset (buffer, 0, sizeof (char) * (bufsize + 1));
      sender = status.MPI_SOURCE;
      tag = status.MPI_TAG;
      MPI_Recv (buffer, bufsize, MPI_CHAR,
		sender, tag, comm_world, &status);
      sbuffer = buffer;
      (*unpack) (buffer, world, tag - 1, maxreplicate, numpop);
    }
  free (sbuffer);
  free (temp);
}

void
mpi_results_worker (long bufs, world_fmt * world, long maxrep,
		    long (*pack) (char **buffer, world_fmt * world,
				  long locus, long maxrep, long numpop))
{
  long numpop = world->numpop;
  int ww, locus;
  char *buffer;
  char **buf;
  long bufsize = 1;
  buffer = (char *) calloc (bufsize, sizeof (char));	//some small value
  for (ww = 0; ww < locidone; ww++)
    {
      buffer = (char *) realloc (buffer, sizeof (char));	//some small value
      buf = &buffer;
      locus = world->who[ww];
      bufsize = (*pack) (&buffer, world, locus, maxrep, numpop);
	fflush (stdout);
      MPI_Send (&bufsize, 1, MPI_LONG, MASTER, locus + 1, comm_world);
      MPI_Send (buffer, bufsize, MPI_CHAR, MASTER, locus + 1,
		comm_world);
    }
  free (*buf);
}

// slownet profiler
// 
#ifdef SLOWNET
void mpi_broadcast_results(world_fmt *world)
{
   long locus;
   long bufsize;
   char **buffer = &world->buffer;
   long maxreplicate = (world->options->replicate
		       && world->options->replicatenum >
		       0) ? world->options->replicatenum : 1;
  MPI_Barrier(comm_world);
	for(locus=0; locus < world->loci+1; locus++)
  {
 	if(myID==MASTER)
		bufsize = pack_result_buffer(buffer,world, locus, maxreplicate, world->numpop);
	MPI_Bcast (&bufsize, 1, MPI_LONG, MASTER, comm_world);
	*buffer = (char *) realloc(*buffer, sizeof(char)*bufsize+1);
        MPI_Bcast (*buffer, bufsize, MPI_CHAR, MASTER, comm_world);
	if(myID!=MASTER)
		unpack_result_buffer(*buffer, world, locus, maxreplicate, world->numpop);
	memset(*buffer,0,sizeof(char)*bufsize);	 
  }
  MPI_Barrier(comm_world);

  for(locus=0; locus < world->loci; locus++)
  {
	if(myID==MASTER)
		bufsize = pack_sumfile_buffer(buffer,world, locus, maxreplicate, world->numpop);
	MPI_Bcast (&bufsize, 1, MPI_LONG, MASTER, comm_world);
	*buffer = (char *) realloc(*buffer, sizeof(char)*(bufsize+1));
        MPI_Bcast (*buffer, bufsize, MPI_CHAR, MASTER, comm_world);
	if(myID!=MASTER)
		unpack_sumfile_buffer(*buffer, world, locus, maxreplicate, world->numpop); 
	memset(*buffer,0,sizeof(char)*bufsize);
   }
}

void
mpi_profiles_master (world_fmt *world, long nparam, int *profilewho)
{
  long pnum;
  int sender;
  int i;
  long pdone;
  MPI_Status status;
  long temp[3];
  long numsent = 0;
  long bufsize;
  long quantsize;
  char **buffer = &world->buffer;
  FILE *outfile = world->outfile;
  for (pnum = 0; pnum < MIN (nparam, numcpu - 1); pnum++)
    {
      MPI_Send (&pnum, 1, MPI_LONG, pnum + 1, pnum + 1, comm_world);
      numsent++;
    }
  for (pnum = 0; pnum < nparam; pnum++)
    {   
	   MPI_Recv (temp, 3, MPI_LONG, MPI_ANY_SOURCE, MPI_ANY_TAG,
		comm_world, &status);
	pdone = temp[0];
	bufsize =temp[1];
	quantsize = temp[2];
      sender = status.MPI_SOURCE;
      profilewho[pdone] = sender;
	*buffer = (char*) realloc(*buffer, sizeof(char)*(bufsize+quantsize+1));
       memset(*buffer,0,sizeof(char)*(bufsize+quantsize+1));
      MPI_Recv (*buffer, bufsize+quantsize, MPI_CHAR, sender, MPI_ANY_TAG, comm_world, &status);
      unpack_quantile((*buffer)+bufsize, world->quantiles[pdone], GRIDSIZE);
	memset((*buffer)+bufsize,0,sizeof(char)*quantsize);	 
      fprintf(outfile,"%s\n\n",*buffer);
      if (numsent < nparam)
	{
	  MPI_Send (&numsent, 1, MPI_LONG, sender, numsent + 1,
		    comm_world);
	  numsent++;
	}
      else
	{
	  MPI_Send (&nparam, 1, MPI_LONG, sender, 0, comm_world);	//end of parameter list
	}
    }
}

void
mpi_profiles_worker (world_fmt *world, long *gmaxptr)
{
  boolean done = FALSE;
  long pnum;
  long temp[3];
  char *quantilebuffer;
  MPI_Status status;
  long qbufsize = 1;
  quantilebuffer = (char *) calloc(qbufsize, sizeof(char));
  while (!done)
    {
      MPI_Recv (&pnum, 1, MPI_LONG, MASTER, MPI_ANY_TAG,
		comm_world, &status);
      if (status.MPI_TAG != 0)	//stop condition
	{
	  print_profile_likelihood_driver (pnum, world, gmaxptr);
	  temp[0] = pnum;
	  temp[1] = strlen(world->buffer);
	   temp[2] = pack_quantile(&quantilebuffer, world->quantiles[pnum], GRIDSIZE);
	  world->buffer = (char*) realloc(world->buffer, sizeof(char) * (temp[1] + temp[2] + 1));
	  strcat(world->buffer,quantilebuffer);
	  MPI_Send (temp, 3, MPI_LONG, MASTER, pnum + 1, comm_world);
	  MPI_Send (world->buffer, temp[1]+temp[2], MPI_CHAR, MASTER, pnum + 1, comm_world);
	  world->profilewho[profiledone++] = pnum;
	}
      else
	{
	  done = TRUE;
	}
    }
	free(quantilebuffer);
}

long pack_quantile(char **buffer, quantile_fmt quant, long n)
{
	long i;
	char fp[LINESIZE];
	long bufsize=LINESIZE;
	*buffer = (char *) realloc(*buffer, sizeof(char) * bufsize);
	sprintf(*buffer,"QUANTILEBUFFER:\n %s\n",quant.name);
	for(i=0; i<n; i++)
	{
		sprintf(fp,"%20.20f\n",quant.param[i]);
		bufsize += strlen(fp) +1;
		*buffer = (char *) realloc(*buffer, sizeof(char) * bufsize);
		strcat(*buffer,fp);
	}
	return bufsize;
}

void unpack_quantile(char *buffer, quantile_fmt quant, long n)
{
	long i;
	char input[LINESIZE];
	char *buf = buffer;
	strcpy (input, "");
	 sgets (input, LINESIZE, &buf);
	 sgets (input, LINESIZE, &buf);
	strcpy(quant.name,input);
	for(i=0; i<n; i++)
	{
	     sgets (input, LINESIZE, &buf);
	     quant.param[i] = atof(input);
	}
}

#endif

/*
// send the data over all loci/replicates to all nodes
// including the master node, so that all nodes can then 
// start calculating profiles [see calc_profiles()]
//
void distribute_locidata(world_fmt *world)
{
  char *buffer;
  pack_loci_data(world, &buffer);
  MPI_allgather(buffer);
  unpack_loci_data(buffer, world);
  free(buffer);
}

void pack_loci_data(world_fmt *world, char **buffer)
{
  long replicates = world->options->repl
  *buffer = realloc(*buffer,LINESIZE);
  hits = sscanf (input, "%li %li %li %li %li", &world->loci, &world->numpop, &world->numpop2, &tmp, &replicates);  
}
*/
// necessary for analyzing old sumfiles using MPI
//
// master is reusing  mpi_runloci_master()
void
assignloci_worker (world_fmt * world)
{
  boolean done = FALSE;
  long locus;
  MPI_Status status;
  while (!done)
    {
      MPI_Recv (&locus, 1, MPI_LONG, MASTER, MPI_ANY_TAG,
		comm_world, &status);
      if (status.MPI_TAG != 0)	//stop condition
	{
	  swap_atl (locus, locidone, world);

	  MPI_Send (&locus, 1, MPI_LONG, MASTER, locus + 1, comm_world);
	  /* we want to know what locus we worked for
	     - to control the work sent by master
	     - to use in setup_parameter0() [combroyden2.c] */
	  world->who[locidone++] = locus;
	}
      else
	{
	  done = TRUE;
	}
    }
}

void
swap_atl (long from, long to, world_fmt * world)
{
  long r;
  timearchive_fmt *tmp;
  for (r = 0; r < world->options->replicatenum; r++)
    {
      tmp = &world->atl[r][to];
      world->atl[r][to] = world->atl[r][from];
      world->atl[r][from] = *tmp;
    }
}


#ifdef SLOWNET
void setup_parameter0_slowmpi(world_fmt * world, nr_fmt * nr, long repkind, long repstart,
		  long repstop, long loci, long kind, boolean multilocus)
{
  long locus, r;
  if (myID != MASTER)
    {
      if (multilocus)
	{
	for (locus = 0; locus < loci; locus++)
	    {
		 if (repkind == SINGLECHAIN)
		{
		  for (r = repstart; r < repstop; r++)
		    create_apg0 (nr->apg0[r][locus], nr,
				 &world->atl[r][locus]);
		}
	      else
		{
		  if (kind != PROFILE)
		    {
		      for (r = repstart; r < repstop; r++)
			create_apg0 (nr->apg0[r][locus], nr,
				     &world->atl[r][locus]);
		      interpolate_like (nr, locus);
		    }
		  else
		    {
		      for (r = repstart; r < repstop; r++)
			create_multiapg0 (nr->apg0[r][locus], nr, r, locus);
		    }
		}
	    }
	}
      else			//single locus
	{
	  if (repkind == SINGLECHAIN)
	    {
	      for (r = repstart; r < repstop; r++)
		create_apg0 (nr->apg0[r][world->locus], nr,
			     &world->atl[r][world->locus]);
	    }
	  else
	    {
	      if (kind != PROFILE)
		{
		  for (r = repstart; r < repstop; r++)
		    create_apg0 (nr->apg0[r][world->locus], nr,
				 &world->atl[r][world->locus]);
		  interpolate_like (nr, world->locus);
		}
	      for (r = repstart; r < repstop; r++)
		create_multiapg0 (nr->apg0[r][world->locus], nr, r,
				  world->locus);
	    }
	}
    }
}
#endif

void setup_parameter0_mpi(world_fmt * world, nr_fmt * nr, long repkind, long repstart,
		  long repstop, long loci, long kind, boolean multilocus)
{
  long locus, r;
  long ll;
  if (myID != MASTER)
    {
      if (multilocus)
	{
	  for (ll = 0; ll < locidone; ll++)
	    {
	      locus = world->locus = world->who[ll];
	      if (repkind == SINGLECHAIN)
		{
		  for (r = repstart; r < repstop; r++)
		    create_apg0 (nr->apg0[r][locus], nr,
				 &world->atl[r][locus]);
		}
	      else
		{
		  if (kind != PROFILE)
		    {
		      for (r = repstart; r < repstop; r++)
			create_apg0 (nr->apg0[r][locus], nr,
				     &world->atl[r][locus]);
		      interpolate_like (nr, locus);
		    }
		  else
		    {
		      for (r = repstart; r < repstop; r++)
			create_multiapg0 (nr->apg0[r][locus], nr, r, locus);
		    }
		}
	    }
	}
      else			//single locus
	{
	  if (repkind == SINGLECHAIN)
	    {
	      for (r = repstart; r < repstop; r++)
		create_apg0 (nr->apg0[r][world->locus], nr,
			     &world->atl[r][world->locus]);
	    }
	  else
	    {
	      if (kind != PROFILE)
		{
		  for (r = repstart; r < repstop; r++)
		    create_apg0 (nr->apg0[r][world->locus], nr,
				 &world->atl[r][world->locus]);
		  interpolate_like (nr, world->locus);
		}
	      for (r = repstart; r < repstop; r++)
		create_multiapg0 (nr->apg0[r][world->locus], nr, r,
				  world->locus);
	    }
	}
    }
}

#endif
