/*
* generate workload based on statistical model
*
* for each job, decide the size (number of nodes used), the runtime,
* and the repetition count (how many times it will be run).
* the output is one line per job, with these 3 values.
*
* the size is based on a complex discrete distribution.
* the runtime is hyperexponential, based on the size.
* the repetition is generalized Zipf (harmonic)
*
* the arrival process was not modeled in the original version; it was
* part of the simulation that uses this workload as input. The original
* papers used a simple Poisson process, and varied the mean of the
* interarrival times to modify the load. this version has the option
* to include such arrival times.
*
* author: Dror Feitelson, 1995
* modifications:
* 1996: improved model (see below)
* 18.11.99: bug fix in adding weight at multiples of 10 (thanks to Kento Aida)
* 27.6.00: bug fix in extending the runtime at powers of two
* 6.7.00: add option of arrival times, change floats to doubles
*/
/*
 * the header names were lost when this file was scraped (everything
 * between '<' and '>' was stripped); restored from usage below:
 *   stdio.h  - FILE, printf, fprintf, fopen
 *   math.h   - log, sqrt
 *   stdlib.h - random (implicitly declared in the original K&R-era code)
 */
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
/*
* two versions of the model are available.
* the original version, from the 1996 paper on "packing schemes for
* gang scheduling", is obtained by defining ORIGINAL_MODEL below.
* the somewhat improved model from the 1997 paper on "improved utilization
* and responsiveness with gang scheduling" is the default.
*/
#if 0
#define ORIGINAL_MODEL
#endif
/*
* the program can include arrival times, or leave them undefined.
* no arrival times is the default.
*/
#if 0
#define WITH_ARRIVALS
#endif
/*
* the program will create gnuplot data files of the model distributions
* if the following definition is made effective. the files are created in
* a subdirectory "plt".
*/
#if 0
#define MAKE_PLOTS
#endif
/*
* The following causes output in the standard format.
* The alternative is a line with 3 numbers for each job:
* the number of processors, the runtime, and the number of repetitions.
*/
#define STD_FORMAT
/*
* the following constants may be changed to fit your needs, but there are
* no guarantees that everything will work as it should... so check the code!
*/
#define LEN 300000 /* jobs generated */
#define MAX_SIZE 128 /* machine size >= 8 */
#define TIME_LIM 64800 /* longest job allowed - 18 hr */
#define FACTOR 9 /* about MAX_TIME / log(TIME_LIM) */
#define MAX_REP 1000 /* far upper bound on number of repetitions */
#define MAX_TIME 100 /* for distribution of runtimes */
#define ARR_FACTOR 1500 /* mean of interarrival times distribution */
#define MAXINT 0x7fffffff
double size_dist[ MAX_SIZE ]; /* cummulative distribution of sizes */
int time_dist[ MAX_TIME ]; /* cummulative distribution of runtimes */
double rep_dist[ MAX_REP ]; /* cummulative distribution of repetitions */
#ifdef MAKE_PLOTS
char filename[ 20 ];
int size_hist[ MAX_SIZE ];
double time_hist[ MAX_SIZE ];
int rep_hist[ MAX_REP ];
struct {
double rtime;
int rep;
int next;
} jobs[ LEN ];
int head[ MAX_SIZE ];
#endif
main()
{
int i, j, k, n, ntot, size, c, rep, tot, rtbkt;
double sum, p, runtime, m, t;
FILE *hist, *times, *reps;
#ifdef WITH_ARRIVALS
double next_arr;
#endif
/* initialize */
#ifdef MAKE_PLOTS
tot = 0;
for (i=1 ; i size_dist[i])
i++;
size = i + 1;
/* assign runtime */
#ifdef ORIGINAL_MODEL
/*
* in the original model, runtimes were from a two-stage
* hyperexponential distribution, and the probability
* of choosing each branch depended on the size
*/
p = ((double)random())/MAXINT;
m = (p < (0.95 - 0.20*(((double)size)/MAX_SIZE))) ? 1 : 7;
runtime = -m * (float)log( ((double)random()) / MAXINT );
#else
/*
* in the improved model, runtimes are from a three-stage
* hyperexponential distribution, and the probability of
* choosing each branch depends on the size. in addition,
* there is an upper bound on the runtime.
*/
do {
p = ((double)random())/MAXINT;
if (p < (0.90 - 0.65*( sqrt(((double)size)/MAX_SIZE) )))
m = 50;
else if (p < (0.97 - 0.37*( sqrt(((double)size)/MAX_SIZE) )))
m = 900;
else
m = 20000;
if ((size == 2) || (size == 4) || (size == 8) || (size == 16) ||
(size == 32) || (size == 64) || (size == 128))
m *= 2;
runtime = -m * log( ((double)random()) / MAXINT );
}
while (runtime > TIME_LIM);
#endif
/* assign repetition */
p = ((double)random())/MAXINT;
i = 0;
while (p > rep_dist[i])
i++;
rep = i;
#ifdef STD_FORMAT
/* Now output the data for this job, using the standard format.
* the fields are:
* job number = ntot [start from 1, not from 0, and count repetitions]
* submit time = -1 [if not modeled] or next_arr
* wait time = -1 [queueing is a scheduler's problem, not
* part of the workload model]
* run time = runtime
* used processors = p_needed
* average CPU time = (cpu_time_used / p_needed) [same as runtime]
* used memory = -1 [not modeled]
* requested processors = -1 [not modeled]
* requested time = -1 [not modeled]
* requested memory = -1 [not modeled]
* completed = 1 [all jobs complete OK in the model]
* user ID = -1 [not modeled]
* group ID = -1 [not modeled]
* application = -1 [not modeled]
* queue = -1 [not modeled]
* partition = -1 [not modeled]
* preceding job = -1 [first in sequence does not depend on anything]
* think time = -1 [not modeled]
*/
#ifndef WITH_ARRIVALS
printf("%6d -1 -1 %12.4f %3d %12.4f -1 -1 -1 -1 1 -1 -1 -1 -1 -1 -1 -1\n",
++ntot,
runtime,
size,
runtime );
#else
printf("%6d %12.4f -1 %12.4f %3d %12.4f -1 -1 -1 -1 1 -1 -1 -1 -1 -1 -1 -1\n",
++ntot,
next_arr,
runtime,
size,
runtime );
next_arr += -ARR_FACTOR * log( ((double)random()) / MAXINT );
#endif
/*
* if the number of repetitions is >1, generate additional jobs
* that depend on each other
*/
for (i=0 ; i tot/4) {
rep = tot/4 - n;
}
else {
rep = jobs[head[j]].rep;
}
n += rep;
size += (j+1) * rep;
runtime += jobs[head[j]].rtime * rep;
rtbkt = (int)(log((double)jobs[head[j]].rtime)*FACTOR);
if (rtbkt < 0) rtbkt = 0;
if (rtbkt >= MAX_TIME) rtbkt = MAX_TIME-1;
time_dist[ rtbkt ] += rep;
if (rep == jobs[head[j]].rep) {
head[j] = jobs[head[j]].next;
}
else {
jobs[head[j]].rep -= rep;
}
}
fprintf(hist, "%f %f\n", ((double)size)/n, runtime/n);
sum = 0;
for (k=0 ; k