// Copyright 1995 Barbara Liskov

/*
\section{Front-End Connection Management}

This file contains the code that manages interactions with a front-end.
We allocate a thread per front-end.  The thread loops around reading
messages from the front-end, and sending appropriate replies back
to the front-end.

This file handles all FE manager functions except the two-phase commit,
which is in commit.cc.

*/

/*
\subsection{Todo}

\begin{itemize}
\item get "MAX_OBJECTS" from configuration?
\item Set-up non-blocking IO so that if the connection to the FE
      gets backed up, other portions of the OR do not get delayed
      waiting for fetched objects to be released.
\item Can TCP streams deadlock if both ends try to write a lot
      of data?  (I am worried about buffers at both ends filling
      up and neither end reading any data because they are blocked
      waiting to write.)
\end{itemize}
*/

#include "common/compat.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>

#include "common/fail.h"
#include "common/Timer.h"
#include "common/fe_or_msg.h"
#include "common/network.h"
#include "common/or_obj.h"
#include "common/orefs.h"
#include "common/samples.h"
#include "common/th_assert.h"
#include "common/oref_set.h"
#include "common/unparser.h"
#include "config/vdefs/COMPILER.h"

#include "mm/handle.h"
#include "mm/mm.h"

#include "fe_manager.h"
#include "pref_set.t"
#include "prefetch_info.h"
#include "fe_table.h"
#include "or.h"
#include "or_config.h"
#include "tm.h"
#include "thread.h"
#include "rwlock_mutex.h"

#include <sys/uio.h>


// Force template instantiation for Pref_set.
#if __DECCXX
#pragma define_template Buckets<IntKey, Bitmap* >
#pragma define_template BHash<IntKey, Bitmap* , Buckets<IntKey, Bitmap* > >
#pragma define_template BucketsImpl<IntKey, Bitmap*>
#pragma define_template BHashGenerator<IntKey, Bitmap*, Buckets<IntKey, Bitmap* > >
#pragma define_template BucketsGenerator<IntKey, Bitmap*>
#endif

// Globals for statistics
// (none of these are declared static, so they are visible to other
// translation units -- presumably commit.cc uses the recv/send/validate
// ones, which are never started or assigned in this file; TODO confirm)

// fetch_timer is reset/started/stopped around each FE_FETCH request in
// FE_manager::main(); total_fetch_time sums the elapsed() values.
Timer fetch_timer;
float total_fetch_time = 0.0;

// Number of milliseconds to wait in the main loop of the fe_manager
#define FE_OR_CONN_TIMEOUT 1000

// Maximum number of objects sent in one message.
// send_chunk() uses two extra iovec entries (message header and the
// descriptor array) in addition to at most one per object, which is
// why the limit check below is on MAX_OBJECTS + 2.
#define MAX_OBJECTS	1000
#if (MAX_OBJECTS + 2) > UIO_MAXIOV
#error "MAX_OBJECTS set higher than possible on this system"
#endif

// Timing of the most recent prepare request.  main() assigns total_time
// and prints all four values when THOR_PRINT_TIME is set.
float total_time, recv_time, send_time, validate_time;
Timer total_timer, recv_timer, send_timer, validate_timer;
// Cumulative counterparts; their elapsed() values are printed by
// handle_stat_message().
Timer cumm_total_timer, cumm_recv_timer, cumm_send_timer, cumm_validate_timer;

// The prefetch object set size is limited to be the following
// function of the prefetch size

// \subsection{FE Manager Implementation}

// XXX - We should check whether an old FE is reconnecting.
// If so, we should reuse the corresponding FE_table.

// Set up the per-connection state for front-end "id", which talks to us
// over "n", and register this manager in the OR-wide FE table.
FE_manager::FE_manager(fe_num id, Network* n) {
    // Scalar fields first; nothing here allocates.
    fe_id       = id;
    net         = n;
    tno         = 1;
    max_objects = MAX_OBJECTS;

    // send_chunk() builds an iovec with one entry for the message
    // header, one for the descriptor array and at most one per object.
    // Make sure writev will accept a vector of that length.
    assert(max_objects + 2 <= UIO_MAXIOV);

    // Container for the objects collected by a fetch.
#if USE_SUBPAGE_PREFETCHING
    pinfo = new Prefetch_Info(MAX_OBJECTS);
#else
    list = new MM_HandleList;
#endif

    // Remaining heap-allocated helpers.
    prefetch_set = new Pref_set(1024);
    desc         = new or_objdesc[max_objects];
    iov          = new iovec[max_objects + 2];
    fetch_count  = new Samples;
    mutex_commit = new Mutex;
    cond_commit  = new Condition(mutex_commit);

    // Publish this manager in the FE table while holding its write lock.
    or->fe_info_set->lock->write_lock(); {
        fe_inf = or->fe_info_set->add_FE(id, this);
    } or->fe_info_set->lock->write_unlock();

    // Print the identifier of the newly connected FE.
    unparser unp(0);
    id.unparse(&unp);
}

// Tear down the per-connection state and remove this FE from the
// OR-wide FE table.
FE_manager::~FE_manager() {
#if USE_SUBPAGE_PREFETCHING
    delete pinfo;
#else
    // Handles that are still on the list have not been released yet;
    // give them back before discarding the list.
    if (list->size() > 0) {
        or->mm->release_list(list);
    }
    delete list;
#endif

#if !COMPILER
    // Announce which FE went away.
    fprintf(stderr, "FE DIED ");
    unparser unp(0);
    fe_id.unparse(&unp);
#endif

    // Buffers used for building outgoing object messages.
    delete [] iov;
    delete [] desc;

    // Connection and helper objects created by the constructor.
    delete net;
    delete prefetch_set;
    delete fetch_count;
    delete mutex_commit;
    delete cond_commit;

    // Unregister from the FE table under its write lock, then drop the
    // per-FE record itself.
    or->fe_info_set->lock->write_lock(); {
        or->fe_info_set->remove_FE(fe_inf->id);
    } or->fe_info_set->lock->write_unlock();

    delete fe_inf;
}

FE_info* FE_manager::fe_info() {
    // effects  Returns the FE table record obtained from add_FE() when
    //          this manager was constructed.
    return this->fe_inf;
}

fe_num FE_manager::id() {
    // effects  Returns the identifier this manager was constructed with.
    return this->fe_id;
}

int  FE_manager::wait_or_timeout(fe_message *msg, int msec) {
    // modifies msg
    // effects  Blocks until the FE sends something or msec milliseconds
    //          have passed.  A negative result means the connection to
    //          this FE failed (or the incoming message could not be
    //          decoded), 0 means the wait timed out, and a positive
    //          result means a message was received and decoded into msg.

    int status = net->wait_or_timeout(msec);

    // Error or timeout: nothing to decode, pass the status through.
    if (status <= 0)
	return status;

    // Data is available; a decode failure is reported as an error.
    if (!msg->decode(net))
	return -1;

    return status;
}


// Timer for fetch handling: started in FE_manager::main() when an
// FE_FETCH request arrives, stopped in send_chunk() just before the
// reply is written, and reported when main() exits.  It is never reset
// in this file, so it presumably accumulates across requests -- TODO
// confirm against Timer's start/stop semantics.
static Timer or_fetch_time;

// Service loop for one front-end connection.  Repeatedly waits (up to
// FE_OR_CONN_TIMEOUT milliseconds) for a request, pushes any pending
// invalidations to the FE, and dispatches the request by message type.
// The loop ends when wait_or_timeout() reports a negative status; fetch
// statistics are then printed to stderr (unless COMPILER is set).
void FE_manager::main() {

    fe_message msg;
    int wait_status;
    // printres is set when the environment variable THOR_PRINT_TIME
    // exists.  It only controls whether the per-prepare timing line is
    // printed; the timers themselves are started and stopped
    // unconditionally below.
    static bool printres = getenv("THOR_PRINT_TIME")? TRUE: FALSE;

    // Ignore SIGPIPE so that writing to a connection the FE has closed
    // is reported as an error instead of delivering a signal whose
    // default action terminates the process.
    struct sigaction todo;
    todo.sa_handler = SIG_IGN;
    sigemptyset(&todo.sa_mask);
    todo.sa_flags = 0;
    sigaction(SIGPIPE, &todo, 0);

    // wait_status: < 0 connection error, 0 timeout, > 0 msg is valid.
    while ((wait_status = wait_or_timeout(&msg, FE_OR_CONN_TIMEOUT)) >=0) {
	// (Older loop condition: net->ok() && msg.decode(net).)
	// Send an invalidation message if necessary.  This runs on every
	// iteration, including timeouts, so invalidations reach an idle
	// FE within roughly one timeout period.
	send_invalidation_message_if_needed();
	if (wait_status == 0) continue; // The network timed out.
	
	/* decode fe request */
	switch (msg.msgtype) {
	  case FE_FETCHROOT:
	    fetch_root();
	    break;
	  case FE_FETCH:
#if PAGING
	    fetch_page(msg.u.fetch.o);
#else
	    // fetch_timer measures this request alone; or_fetch_time is
	    // stopped inside send_chunk().
	    // NOTE(review): when fetch_objects() denies the fetch it
	    // returns without reaching send_chunk(), so or_fetch_time is
	    // not stopped on that path -- confirm Timer tolerates that.
	    fetch_timer.reset(); fetch_timer.start();
            or_fetch_time.start();            
	    fetch_objects(msg.u.fetch.o, &msg.u.fetch.prefetch, 
			  msg.u.fetch.clear_pref_set );
	    fetch_timer.stop();
	    total_fetch_time += fetch_timer.elapsed();
#endif
	    break;
  	  case FE_PREPARE_COORD:
	    // Prepare request where this OR acts as coordinator
	    // (commit_trans is defined outside this file).
	    if (or->config->debug_level > 1)
		printf("Got coordinator prepare message from FE\n");
	    
	    total_timer.reset(); total_timer.start();
	    cumm_total_timer.start();

	    commit_trans(TRUE);

	    total_timer.stop(); total_time = total_timer.elapsed();
	    cumm_total_timer.stop();
	    // recv_time, validate_time and send_time are not assigned in
	    // this file; presumably commit_trans fills them in.
	    if (printres)
		fprintf(stderr, "Recv = %9.5f, Validate = %9.5f,"
			" Send = %9.5f, Total = %9.5f\n", recv_time,
			validate_time, send_time, total_time);
	    break;
	  case FE_PREPARE_PART:
	    // Same as the coordinator case, but as a participant.
	    if (or->config->debug_level > 1)
		printf("Got participant prepare message from FE\n");
	    cumm_total_timer.start();
	    total_timer.reset(); total_timer.start();

	    commit_trans(FALSE);

	    total_timer.stop(); total_time = total_timer.elapsed();
	    cumm_total_timer.stop();
	    if (printres)
		fprintf(stderr, "Recv = %9.5f, Validate = %9.5f,"
			" Send = %9.5f, Total = %9.5f\n", recv_time,
			validate_time, send_time, total_time);
	    
	    break;
	  case FE_INVALID_ACK:
	    // The FE acknowledges invalidation messages it has seen.
	    process_invalid_ack(msg.u.invalid_ack.last_message_seen);
	    break;
	  case FE_STAT:
	    handle_stat_message();
	    break;
	  // The next three requests forward a size carried in the
	  // message to the memory manager.
	  case FE_LOG_SIZE:
	    or->mm->resize_log(msg.u.log_size);
	    break;
	  case FE_CACHE_SIZE:
	    or->mm->resize_cache(msg.u.cache_size);
	    break;
	  case FE_DSPACE_USE:
	    or->mm->resize_dspace(msg.u.dspace_use);
	    break;
	  default:
	    /* XXX - Log error */
	    // Unknown request: warn and shut the connection down.
	    // Presumably the next wait_or_timeout() then reports an
	    // error, which ends the loop.
	    warn("FE sent invalid message type %d\n", msg.msgtype);
	    net->shutdown();
	    break;
	};
    }

    if (net->ok()) {
	// Finished because of connection shutdown
	// XXX - Log something?
    }

#if !COMPILER
    // Final per-connection statistics.
    fetch_count->report("objects per fetch");
    fprintf(stderr, "accumulated fetch time (sec) : %f\n", 
					or_fetch_time.elapsed());
#endif
}

// \subsection{Message handlers}

void FE_manager::fetch_root() {
    or_message msg;
    msg.msgtype = OR_ROOT;
    msg.u.root = or->mm->directory();

    msg.encode(net);
    net->flush();
}

void FE_manager::handle_stat_message() {
    or_message  reply;
    or_stat     stat;

    // Fill in the "stat" structure by asking various modules.
    or->tm->stat(stat);
    or->mm->stat(stat);

    // Now get the OR wall-clock time.
    struct timespec t;
    getclock(TIMEOFDAY, &t);
    stat.clock.seconds = t.tv_sec;
    stat.clock.micros  = t.tv_nsec / 1000;

    reply.msgtype = OR_STAT;
    reply.encode(net);
    stat.encode(net);
    net->flush();
    fprintf(stderr, "Cummulative data\n");
    float cumm_total_time = cumm_total_timer.elapsed();
    float cumm_recv_time = cumm_recv_timer.elapsed();
    float cumm_send_time = cumm_send_timer.elapsed();
    float cumm_validate_time = cumm_validate_timer.elapsed();
    fprintf(stderr, "C_Recv = %9.5f, C_Validate = %9.5f,"
	    " C_Send = %9.5f, C_Total = %9.5f\n", cumm_recv_time,
	    cumm_validate_time, cumm_send_time, cumm_total_time);
}

// Handle a fetch request for object "o".
//   o          - the object the FE asked for
//   hint       - prefetch hint from the request, passed through to the
//                memory manager's prefetch routine
//   clear_pref - non-zero asks us to empty prefetch_set first
// On success the object and whatever was prefetched are sent with
// send_objects(); if the object cannot be fetched an OR_FETCH_DENY
// message naming "o" is sent instead.
void FE_manager::fetch_objects(Oref o, prefetch_hint const* hint, int clear_pref) {
    // The code has been structured so that the FE_Info lock is grabbed, o
    // is entered into the FE_table and the lock released. The object is
    // then fetched and then prefetching takes place with the FE_Info lock
    // held

    MM_Handle* first;
    // Add initial object to fe_table
    fe_inf->lock->write_lock(); {
	fe_inf->fe_table->add_object(o);
    } fe_inf->lock->write_unlock();
    
    // The lock on FE_info has been released during the actual fetch
    first = or->mm->fetch(o);
    if (first == 0) {
	/* Could not find object */
	// Note that "o" stays in the FE table on this path; nothing here
	// removes it again.
	// NOTE(review): or_fetch_time was started by main() and is only
	// stopped in send_chunk(), which this path never reaches.
	fprintf(stderr, "fetch failed: %d:%d\n",OREF_SEGMENT(o),OREF_INDEX(o));
	
	or_message msg;
	msg.msgtype = OR_FETCH_DENY;
	msg.u.denied = o;
	msg.encode(net);
	net->flush();
	return;
    }
    
    // Grab the lock again
    fe_inf->lock->write_lock(); {
      // Trim the prefetch set if the FE GCed recently.
      if (clear_pref) 
	prefetch_set->clear();
#if USE_SUBPAGE_PREFETCHING
      // Add all prefetched objects to fe table. 
      // If they are all from the same segment and we
      // are only remembering which segments were sent to the fe,
      // then insert only the segment of the first object.
      // Seed pinfo with the requested object as entry 0, then let the
      // memory manager add the prefetched ones.
      pinfo->first = first;
      pinfo->no_objects = 1;
      pinfo->pref[0].obj_ = first->obj();
      pinfo->pref[0].oref = first->oref();
      or->mm->subpage_prefetch(pinfo, prefetch_set, hint);
      Oref first_oref = first->oref();
      if (!fe_inf->fe_table->is_fine_grain(OREF_SEGMENT(first_oref)))
	  fe_inf->fe_table->add_object(first_oref);
#else
      // Start the handle list with the requested object and let the
      // memory manager append the prefetched ones.
      list->clear();
      list->append(first);
      or->mm->pointer_prefetch(list, prefetch_set, hint);
      // NOTE(review): first_oref is not used in this branch.
      Oref first_oref = first->oref();
      int num = list->size();
      // Slot 0 is "first"; the loop starts at 1, presumably because the
      // requested object was already recorded above -- this assumes
      // first->oref() equals o, TODO confirm.
      for (int i = 1; i < num; i++)
	  fe_inf->fe_table->add_object(list->slot(i)->oref());
#endif
    } fe_inf->lock->write_unlock();
    
    // Record how many objects this fetch returned, send them, and then
    // let go of what was collected for this request.
#if USE_SUBPAGE_PREFETCHING
    fetch_count->add(pinfo->no_objects);
    send_objects();
    or->mm->mutex->grab(); {
      pinfo->clear(TRUE);
    } or->mm->mutex->release();
#else
    fetch_count->add(list->size());
    send_objects();
    or->mm->release_list(list);
    list->clear();
#endif
}

#if PAGING
void FE_manager::fetch_page(Oref o) {
    // effects  Looks up the page for o's segment and sends it to the
    //          FE; if there is no such page, sends OR_FETCH_DENY for o.
    Page* page = or->pages->fetch(OREF_SEGMENT(o));

    if (page != 0) {
	send_page(page);
	return;
    }

    /* Could not find page */
    or_message deny;
    deny.msgtype  = OR_FETCH_DENY;
    deny.u.denied = o;
    deny.encode(net);
    net->flush();
}
#endif

void FE_manager::send_invalidation_message_if_needed() {
    // Send any invalid object set information that has not
    // been sent to this frontend yet

    fe_inf->lock->write_lock();

    Invalid_set *invset = fe_inf->invalid_objs;
    if (fe_inf->current_message > fe_inf->max_message) {
	// There is invalidation information to be sent to the frontend
	orefs *inv_orefs = invset->invalidated_since(fe_inf->max_message);
	int size = inv_orefs->size();
	th_assert(size > 0, "Size of invalid orefs is zero at OR");
	or_message msg;
	msg.msgtype = OR_INVALIDATION;
	msg.u.invalid.count = size;
	msg.u.invalid.msg_start = fe_inf->max_message + 1;
	msg.u.invalid.msg_end = fe_inf->current_message;

	msg.encode(net);
	net->send_buffer((caddr_t) inv_orefs->as_pointer(),
			 size*sizeof(Oref));
	net->flush();
	delete inv_orefs;
	if (or->config->debug_level)
	    fprintf(stderr, "Sent Inv message: S: %d E:%d Num: %d\n", 
		    fe_inf->max_message + 1,
		    fe_inf->current_message,
		    size);
	fe_inf->max_message = fe_inf->current_message;

	// Drop all information about recently prefetched objects
	// because maybe the client wants all of those again.
	prefetch_set->clear();
    }

    fe_inf->lock->write_unlock();
}
    
void FE_manager::process_invalid_ack(ubits32 last_msg_seen) {
    // requires last_msg_seen is not larger than the number of the last
    //          invalidation message sent to this FE (checked below).
    // effects  Handles an FE_INVALID_ACK: tells the invalid-object set
    //          to discard what the FE has acknowledged, using
    //          last_msg_seen as the cut-off for remove_before().
    //
    // NOTE(review): last_msg_seen comes straight from the FE's message,
    // so a misbehaving FE can trip the assertion below -- confirm that
    // is the intended reaction.
    fe_inf->lock->write_lock();
    {
	th_assert(last_msg_seen <= fe_inf->max_message,
		  "FE acked bad number for invalid objects");
	fe_inf->invalid_objs->remove_before(last_msg_seen);
    }
    fe_inf->lock->write_unlock();

    // last_msg_seen is unsigned, so it is printed with %u; %d would
    // show large message numbers as negative values.
    if (or->config->debug_level)
	    fprintf(stderr, "Received Inv Ack. Msg seen: %u\n",
		    (unsigned int) last_msg_seen);
}

void FE_manager::send_objects() {
    // effects  Sends every object collected by the current fetch to
    //          the FE, split into messages of at most max_objects
    //          objects each.

#if USE_SUBPAGE_PREFETCHING
    int total = pinfo->no_objects;
#else
    int total = list->size();
#endif

    for (int sent = 0; sent < total; ) {
	/* Size of the next chunk: whatever is left, capped at the limit */
	int batch = total - sent;
	if (batch > max_objects)
	    batch = max_objects;

	send_chunk(sent, batch);
	sent += batch;
    }
}

#if PAGING
void FE_manager::send_page(Page const* p) {
    // effects  Writes the start of page p to the FE and flushes.
    //
    // XXX - Only a fixed 100 is passed as the length.  Earlier variants
    // of this routine (since disabled) sent PAGE_SLOTS * sizeof(OR_slot)
    // bytes, either through a one-entry iovec with net->send_vector or
    // directly with net->send_buffer.
    const int length = 100;
    net->send_buffer(p, length);
    net->flush();
}
#endif

void FE_manager::send_chunk(int start, int number) {
    // requires number does not exceed the capacity of desc (and
    //          iov has room for number + 2 entries); send_objects()
    //          guarantees this by capping chunks at max_objects.
    // effects  Sends one OR_OBJECTS message containing the "number"
    //          collected objects beginning at position "start".  The
    //          message is written as a single vector: the raw
    //          or_message header, the array of object descriptors, and
    //          then the object contents.
    int i;

    /* Set-up initial entries in iov array */
    or_message msg;
    msg.msgtype = OR_OBJECTS;
    msg.u.objects.number = number;

    iov[0].iov_base = (caddr_t) &msg;
    iov[0].iov_len  = sizeof(or_message);

    iov[1].iov_base = (caddr_t) desc;
    iov[1].iov_len  = number * sizeof(or_objdesc);

    /* Handle objects */
    // iov_index is the last entry filled in so far; "previous" is the
    // base address of that entry once it describes object data (0 while
    // no object has been added yet).
    int iov_index = 1;
    caddr_t previous = 0;
    for (i = 0; i < number; i++) {
        
#if USE_SUBPAGE_PREFETCHING
	// Index relative to "start", like the handle-list variant below,
	// so that a chunk other than the first sends its own objects.
	OR_obj *obj = pinfo->pref[start+i].obj_;
        Oref oref = pinfo->pref[start+i].oref;
#else
	MM_Handle* h = list->slot(start+i);
	OR_obj *obj = h->obj();
        Oref oref = h->oref();
#endif
	/* Find size (in slots) */
	int size = OR_obj_full_size(obj);

	/* Set-up descriptor */
	desc[i].o = oref;
	desc[i].objsize = size;

        // Set-up iovec entries for object contents, merging
        // contiguous objects into the same iovec entry.
        // It requires objects to be in increasing address order to
        // do a good job. This is ensured by the order in which
        // Itable::get_modifications returns modified objects in a
        // segment.  
        if (previous && (previous+iov[iov_index].iov_len == (caddr_t)obj)) 
	    // obj starts exactly where the current entry ends: extend it.
	    iov[iov_index].iov_len += size*sizeof(OR_slot);
	else {
	    // Otherwise start a new entry for this object.
            iov[++iov_index].iov_base = (caddr_t) obj;
	    iov[iov_index].iov_len = size*sizeof(OR_slot);
            previous = (caddr_t) obj;
	}
    }

    /* Now send the collected data */
    // The fetch timer started in main() covers everything up to, but
    // not including, the network write.
    or_fetch_time.stop();
    net->send_vector(iov, iov_index+1);
    net->flush();
}

