/* Copyright Barbara Liskov, MIT 1996 */

/*

NET.CC

This provides the interface between the cache subsystem and the OR(s)
storing objects persistently.  The procedures in this file should
be used only by other parts of the cache subsystem.

*/

// common includes
#include "common/bits.h"
#include "common/intarray.h"
#include "common/compat.h"
#include "common/fe_num.h"
#include "common/fe_or_msg.h"
#include "common/network.h"
#include "common/networkset.h"
#include "common/bufnet.h"
#include "common/or_obj.h"
#include "common/other_unix.h"
#include "common/prefetch.h"
#include "common/or_stat.h"
#include "common/th_assert.h"
#include "common/orefs.h"
#include "common/intset.h"
#include "common/locator.h"
#include "common/fail.h"
#include "common/xrefs.h"
#include "common/uids.h"
#include "common/unparser.h"
#include "common/Timer.h"

// fe includes
#include "fe/fe_config.h"
#include "fe/runtime/disphdr.h"
#include "fe/runtime/transinfo.h"
#include "fe/runtime/invalidation.h"

// cache includes
#include "fe/cache/cache.h"
#include "fe/cache/cache_internal.h"
#include "fe/cache/net.h"
#include "fe/cache/swiz.h"

// vdef includes
#include "config/vdefs/LAZY_SWIZZLING.h"
#include "config/vdefs/MAX_PREFETCHED.h"
#include "config/vdefs/ADAPT_PREFETCH.h"
#include "config/vdefs/ADAPT_PREFETCH_THR.h"
#include "config/vdefs/ADAPT_PREFETCH_INCR.h"
#include "config/vdefs/ADAPT_PREFETCH_UP_LIMIT.h"
#include "config/vdefs/ADAPT_PREFETCH_LOW_LIMIT.h"

// system includes
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/time.h>
#include <string.h>

#include "common/mdebug.h"

// errno is provided by <errno.h> (included above).  It must not be
// re-declared by hand: on platforms where errno is a macro (for example a
// per-thread accessor) a manual "extern int errno;" fails to compile or
// refers to the wrong object.

// Environment handed (as the opaque "env" pointer) to the network dispatch
// code while fetch_object is waiting for an object fetch reply.
struct FetchEnv {
    Xref x; // Object being fetched
    bool was_fetched; // Intended to be set to TRUE by Mdispatch_OR_objects
    // when x has been fetched.  fetch_object initialises it to FALSE and
    // tests it before re-sending a request.
    // NOTE(review): no code in this file currently sets it to TRUE --
    // confirm whether that happens elsewhere.
};


// Global variable. See net.h.  Set to TRUE by catch_sigio and reset to
// FALSE by read_from_net after each round of message processing.
bool message_from_server;

// Active connections, keyed by OR number.  Created by init_net.
static NetworkSet* nets = NULL;
// Default prefetch hint; used by fetch_object when the caller passes none.
static prefetch_hint hint;
// OR locator.  Created by init_locator; used to open connections.
static Locator *locator;
// Identifier of this FE.  Created by init_net and sent to each OR when a
// connection is set up.
fe_num* fe_id = 0;

// Not referenced by any routine in this file.
or_stat initial_stat;

// Signatures of private routines.

static Network* cache_set_up_connection (OR_num or_num);
// effects	Sets up a connection with the OR and returns the Network*.
// errors	Returns 0 if it is not possible to set up a connection
//		with the given OR.

static bool cache_prim_fetch_root (OR_num or_num, Oref *result);
// modifies	"result"
// effects	Fills in the oref of the root object at the OR.
//		Returns TRUE if all is well, FALSE otherwise.
  
static bool skip_data(Network* net, int bytes);
// effects	Reads and discards the specified number of bytes from "net".
//		Returns TRUE iff successful.

static obj fetch_object(Xref x, prefetch_hint* h);
// effects	Fetches the object named by "x" into the cache.
//		Returns the fetched object if successful.  Returns 0
//		otherwise.

static bool Mdispatch_OR_objects(Network* net, OR_num or_num, int num,
				 void* env);
// requires	"net" is the connection to the OR named by "or_num".
//		A "fetch-reply" message header for "num" objects
//		was just read over the network.
// effects	Reads the objects in the rest of the "fetch-reply"
//		message and puts them in the cache.  Returns TRUE
//		iff successful.
//		"env" is the environment meant to be used by the routine
//		for determining whether the desired object was fetched.

static OR_desc* get_desc_ptr(OR_desc* ods, int orsetsize, int or);
    // effects: Searches ods for the descriptor for server "or" and returns
    //          a pointer to it.  If none exists for "or", returns NULL.


static bool Mdispatch_Invalidation(Network* net, OR_num or_num, int num,
				   ubits32 msg_start, ubits32 msg_end,
				   Msg_Wait_Type wait_type);
// requires     net corresponds to the network connection between or_num
//              and this frontend.
// effects      Retrieves num invalid orefs from the network and invalidates
//              the objects corresponding to those orefs.
//              msg_start and msg_end indicate the range of invalidation
//              messages that the orefs correspond to.
//              wait_type is used for debugging purposes.
//              Returns TRUE if it succeeds in getting the orefs, FALSE if
//              reading them from the network fails.

static void print_newxu (Xrefs* xrefs, Uids* uids);
// effects	Prints out xrefs and uids for new objects for debugging.

static bool net_single_message(OR_num or, IntSet* block_servers, OR_desc *od,
			       int &cur_wait_servers, void* env,
			       Msg_Wait_Type wait_type);
// requires: Data is available on the connection to OR "or".
//           od is NULL or points to the description info for "or".
// modifies: cur_wait_servers, block_servers, env
// effects:  cur_wait_servers indicates the number of servers that we
//           still have to wait for.  Reads one message from "or"; if it
//           is a message that the FE was waiting for, decrements
//           cur_wait_servers by 1 and removes "or" from block_servers.
//           Processes the message by dispatching to the appropriate
//           routine (env is used by that routine for obtaining the
//           relevant environment state).  This is the procedure where
//           handling of ALL messages from the server should be done.
//           This procedure will not perform a longjmp.
//           wait_type is used for debugging purposes to determine where
//           the call is from.

// returns:  TRUE if it was successful in reading the data

// Exported routines

// Initialise the module-level network state: the (empty) connection set,
// the default prefetch hint and the identifier of this front end.
void init_net() {
    nets = new NetworkSet;

    // A negative configured value means "not configured": fall back to the
    // compiled-in default.
    hint.max_prefetch = (FEConf->max_prefetch >= 0) ? FEConf->max_prefetch
                                                    : MAX_PREFETCHED;

    // The FE identifier is built from host id, process id and current time.
    fe_id = new fe_num(gethostid(), getpid(), time(0), 0);

    // Unparse the freshly built identifier through an unparser constructed
    // with argument 0.
    unparser unp(0);
    fe_id->unparse(&unp);
}

// Create the OR locator and translate the socket address in "OR_spec"
// into an OR number.  Returns 0 if the locator cannot be initialised.
OR_num init_locator(struct sockaddr_in *OR_spec) 
{
    // Create OR locator
    locator = new Locator;
    if (!locator->init())
        return 0;

    // The locator identifies an OR by its address plus the offset of its
    // port from the default FE port.
    ubits32 address = OR_spec->sin_addr.s_addr;
    int port_offset = ntohs(OR_spec->sin_port) - OR_DEFAULT_FE_PORT;

    return locator->or_number(address, port_offset);
}

// Return the connection to OR "or_num", opening one if the connection set
// does not hold it yet.  Returns 0 if no connection could be set up.
Network* cache_get_OR_conn (OR_num or_num) {
    Network* existing = nets->fetch(or_num);
    if (existing)
        return existing;
    return cache_set_up_connection(or_num);
}

// Handler that only records that a message from a server has arrived.
// The argument is ignored.  The flag raised here is cleared again by
// read_from_net once it has drained the connections; the original author
// notes it is examined at the end of each client operation.
void catch_sigio(int dummy) {
    message_from_server = TRUE;
}



// Return the root object of OR "or_num", fetching it into the cache if it
// is not already there.  Returns 0 if the root oref cannot be obtained or
// the fetch fails.  "h" is the prefetch hint passed on to the fetch.
obj cache_fetch_root (OR_num or_num, prefetch_hint* h) {
    // Ask the OR which oref names its root.
    Oref root_oref;
    if (!cache_prim_fetch_root(or_num, &root_oref))
        return 0;

    Xref root_ref;
    root_ref.or = or_num;
    root_ref.oref = root_oref;

    // Use the cached copy when there is one; otherwise go to the OR.
    obj root = swiz_get_object(root_ref);
    if (root == 0)
        root = cache_wait_for(root_ref, h);
    return root;
}

// Receive and process messages from the ORs.
//
// ods/orsetsize  descriptors of the servers (and message types) the caller
//                is waiting for.
// option         FE_NO_BLOCK: only drain messages that are already
//                available; otherwise block until every server in "ods"
//                has delivered one of its expected messages.
// aborted        set to the result of Fe_Trans->must_abort() once the
//                messages have been processed (FALSE on early failure).
// allow_longjmp  passed to Fe_Trans->abort_transaction when aborting.
// env            opaque environment handed to the message dispatchers.
// wait_type      passed through to the dispatchers.
//
// Returns FALSE on a select or receive failure, TRUE otherwise.
bool read_from_net(OR_desc* ods, int orsetsize, Block_Option option,
                   bool* aborted, bool allow_longjmp, void* env,
                   Msg_Wait_Type wait_type) {
    // Later should special case single server case if needed for efficiency

    // Servers this call still has to hear from.
    IntSet block_servers;
    block_servers.predict(orsetsize);
    for (int d = 0; d < orsetsize; d++)
        block_servers.insert(ods[d].or);

    *aborted = FALSE;

    // Number of servers we still have to block for.
    int pending = orsetsize;
    if (option == FE_NO_BLOCK)
        pending = 0;

    for (;;) {
        // Poll (timeout 0) when nothing is awaited, else pass -1.
        int timeout = -1;
        if (pending == 0)
            timeout = 0;

        IntArray* ready;
        int nready = nets->readselect(timeout, ready);
        if (nready < 0)
            return FALSE;
        if (nready == 0 && pending == 0) {
            // No more messages from server and FE does not have to wait
            break;
        }

        // Take exactly one message from every server that has data.
        for (int r = 0; r < ready->size(); r++) {
            int ornum = ready->slot(r);

            // The descriptor only matters while we are still waiting.
            OR_desc* od = NULL;
            if (pending)
                od = get_desc_ptr(ods, orsetsize, ornum);

            if (!net_single_message(ornum, &block_servers, od,
                                    pending, env, wait_type))
                return FALSE;
        }

        // Clear the flag only now: a SIGIO arriving after this point leaves
        // it FALSE at the top of the loop, but the select above will still
        // notice the pending message.
        message_from_server = FALSE;
    }

    *aborted = Fe_Trans->must_abort();
    if (*aborted)
        Fe_Trans->abort_transaction(allow_longjmp);
    // This statement may or may not be reached
    return TRUE;
}

// Fetch the object named by "x" into the cache and return it, or return 0
// if the fetch failed.  "h" is the prefetch hint for the fetch (0 selects
// the default hint).
obj cache_wait_for(Xref x, prefetch_hint* h) {
    // Fetch the object into the cache.
    obj o = fetch_object(x, h);

#if LAZY_SWIZZLING
    // The fetching code may not have swizzled the object.
    // Check for it and do it ourselves.  A failed fetch yields 0, which
    // must not be handed to the swizzling code.
    if (o != 0 && is_unswizzled(o))
        cache_swizzle_object(x.or, o);
#endif

    return o;
}


// Private routines

// requires	An OR_COMMITTED message has just been received on "net".
// modifies	"x", "u"
// effects	Empties both arrays, then reads "num" xrefs followed by
//		"num" uids from the network straight into them.
//		Returns true iff the receive succeeded.
static bool read_new_object_info(Network* net, int num, Xrefs* x, Uids* u) {
    // Make room for exactly "num" entries in each array.
    x->clear();
    x->_enlarge_by(num);
    u->clear();
    u->_enlarge_by(num);

    // One vectored receive fills both arrays: xrefs first, then uids.
    struct iovec parts[2];
    parts[0].iov_base = (caddr_t) x->as_pointer();
    parts[0].iov_len  = sizeof(Xref) * num;
    parts[1].iov_base = (caddr_t) u->as_pointer();
    parts[1].iov_len  = sizeof(OR_uid) * num;

    return (net->recv_vector(parts, 2));
}

// requires: A transaction result message ("reply") has been received on
//           net and it was expected from this OR.  "env" points to the
//           TransEnv of the transaction.
// effects:  Records the outcome in the TransEnv:
//             - read-only transaction: stays committed only if it was
//               still committed and this reply is OR_READ_OK;
//             - OR_COMMITTED: reads the xrefs/uids of the new objects
//               into the TransEnv and marks it committed;
//             - any other reply: marks it not committed.
//           cur_wait_servers is the number of servers still to be heard
//           from; the transaction is completed only when it is 0.
//           Returns FALSE if there was a network error (the transaction
//           is then completed as not committed).  Else returns TRUE.
//           "od" is not used.
static bool Mdispatch_Commit_Result(Network* net, or_message* reply, void* env,
                                    OR_desc* od, int cur_wait_servers) {
    // XXX Need to match tids to ensure that the FE got the message it was
    // expecting.

    TransEnv* tenv = (TransEnv*) env;

    if (tenv->read_only) {
        tenv->committed = tenv->committed && (reply->msgtype == OR_READ_OK);
    } else if (reply->msgtype != OR_COMMITTED) {
        tenv->committed = FALSE;
    } else {
        // Reply received from coordinator: it carries the new object info.
        int count = reply->u.commit.count;
        if (!read_new_object_info(net, count, tenv->xrefs, tenv->uids)) {
            tenv->committed = FALSE;
            Fe_Trans->complete_transaction(tenv);
            return FALSE;
        }
        tenv->committed = TRUE;
    }

    // While replies from other servers are outstanding the transaction
    // must not be finished off.
    if (cur_wait_servers == 0) {
        Fe_Trans->complete_transaction(tenv);
        if (FEConf->debug_level > 1)
            print_newxu(tenv->xrefs, tenv->uids);
    }

    return TRUE;
}

// Read one message from OR "or" and process it completely.
//
// If the caller is still waiting (cur_wait_servers != 0), "or" is in
// block_servers, and the message type is one of those listed in "od", the
// message counts as expected: "or" is removed from block_servers and
// cur_wait_servers is decremented.  "env" is interpreted according to the
// message type (Oref* for OR_ROOT, or_stat* for OR_STAT, otherwise passed
// to the dispatch routine).  Returns FALSE if reading failed.
static bool net_single_message(OR_num or, IntSet* block_servers, OR_desc *od,
                               int &cur_wait_servers, void* env,
                               Msg_Wait_Type wait_type) {
    // Important Convention:
    // Every message received from a server must be processed *completely*
    // here, normally by a dispatch function named "Mdispatch_*" (one- or
    // two-statement handlers may stay inline, and skipping unwanted data
    // is done right here, as for OR_COMMITTED).  Do not merely receive a
    // message and handle it later: asynchronous messages may arrive from
    // the servers and would then be processed out of order.  As a sanity
    // check, nothing done after a call to read_from_net should be
    // affected by the reception of other messages.

    Network* net = nets->fetch(or);
    th_assert(net, "Network set does not contain network");

    or_message reply;
    if (! reply.decode(net)) {
        perror("reading from OR connection");
        return FALSE;
    }

    // Did the caller want to wait on this server, and is this one of the
    // message types it is waiting for?
    bool expecting_message = FALSE;
    if (cur_wait_servers && block_servers->contains(or) && od) {
        ubits32* wanted = od->msgtype;
        for (int k = 0; !expecting_message && k < od->size; k++)
            expecting_message = (reply.msgtype == wanted[k]);
        if (expecting_message) {
            block_servers->remove(or);
            cur_wait_servers--;
        }
    }

    bool result = TRUE;
    switch (reply.msgtype) {
        case OR_ROOT: {
            // The root oref travels in the header; store it if wanted.
            if (expecting_message) {
                Oref* root = (Oref *)env;
                *root = reply.u.root;
            }
            break;
        }
        case OR_OBJECTS: {
            result = Mdispatch_OR_objects(net, or,
                                          reply.u.objects.number, env);
            break;
        }
        case OR_COMMITTED:
            if (! expecting_message) {
                // Were not expecting this message. Skip the data
                int count = reply.u.commit.count;
                result = skip_data(net,
                                   sizeof(Xref)*count + sizeof(OR_uid)*count);
                break;
            }
            // An expected OR_COMMITTED deliberately falls through: all
            // transaction results share the same processing.
        case OR_READ_OK:
        case OR_STALEABORT:
        case OR_FAILABORT: {
            if (expecting_message) {
                result = Mdispatch_Commit_Result(net, &reply, env, od,
                                                 cur_wait_servers);
            }
            break;
        }
        case OR_INVALIDATION: {
            // Asynchronous message
            int count = reply.u.invalid.count;
            int first_msg = reply.u.invalid.msg_start;
            int last_msg = reply.u.invalid.msg_end;
            result = Mdispatch_Invalidation(net, or, count, first_msg,
                                            last_msg, wait_type);
            break;
        }
        case OR_STAT: {
            // An unexpected "or_stat" structure is read into a scratch
            // object and discarded; an expected one goes to the caller.
            or_stat scratch;
            or_stat* target = &scratch;
            if (expecting_message)
                target = (or_stat*) env;
            result = target->decode(net);
            break;
        }
        default: {
            fprintf(stderr, "Received Message %d from OR %d\n",
                    reply.msgtype, or);
            th_fail("Bad message from OR");
            break;
        }
    }

    if (!result)
        fprintf(stderr, "Error while receivinng data from OR\n");
    return result;
}

// effects: Scans the first "orsetsize" entries of "ods" and returns a
//          pointer to the first one whose server number equals "or".
//          Returns NULL when there is no such entry.
static OR_desc* get_desc_ptr(OR_desc* ods, int orsetsize, int or) {
    int idx = 0;
    while (idx < orsetsize) {
        OR_desc* candidate = &ods[idx];
        if (candidate->or == or)
            return candidate;
        idx++;
    }
    return NULL;
}

// requires: net is the connection between or_num and this front end, and
//           the header of an invalidation message announcing "num" orefs
//           has just been read from it.
// effects:  Reads the "num" invalidated orefs from net, acknowledges the
//           message (reporting msg_end as the last message seen) and then
//           hands the orefs to Fe_Trans->invalidate_objects together with
//           the message range [msg_start, msg_end] and wait_type.
//           Returns FALSE if the orefs could not be read, TRUE otherwise.
//           The temporary oref buffer is released on every return path.
static bool Mdispatch_Invalidation(Network* net, OR_num or_num, int num,
                                   ubits32 msg_start, ubits32 msg_end,
                                   Msg_Wait_Type wait_type) {

    // FE should not ack immediately but it should wait for some time
    th_assert(num > 0, "No. of invalidation orefs are less than 1");
    Oref *inv_orefs = new Oref[num];

    if (!net->recv_buffer(inv_orefs, num*sizeof(Oref))) {
        fprintf(stderr, "Error while receiving invalidation message\n");
        // Do not leak the buffer when the receive fails.
        delete [] inv_orefs;
        return FALSE;
    }

    // We would like to add "piggybacking" later. But currently we
    // ack the invalidation message immediately.
    // Sending the ack before the real handling of the message
    // This should be ok
    // NOTE(review): unlike the other senders in this file, the ack is
    // encoded without a following net->flush() -- confirm that is intended.
    fe_message ack;
    ack.msgtype = FE_INVALID_ACK;
    ack.u.invalid_ack.last_message_seen = msg_end;
    bool netans = ack.encode(net);
    th_assert(netans, "Failed in sending an invalidation ack");

    if (FEConf->debug_level > 0) {
        fprintf(stderr, "Received Inv message: OR: %d S: %d E:%d Num: %d\n",
                or_num, msg_start, msg_end, num);
    }

    Fe_Trans->invalidate_objects(or_num, inv_orefs, num, msg_start,
                                 msg_end, wait_type);
    delete [] inv_orefs;
    return TRUE;
}

// effects	Opens a connection to OR "or_num" through the locator,
//		announces this FE's identifier on it and registers it in
//		the connection set.
// errors	Returns 0 if the connection cannot be made or the FE
//		identifier cannot be sent (the connection is then deleted).
static Network* cache_set_up_connection (OR_num or_num) {
    Network* conn = locator->make_connection(or_num, OR_DEFAULT_FE_PORT);
    if (conn == 0)
        return 0;

    // Send our FE number
    bool announced = fe_id->encode(conn) && conn->flush();
    if (!announced) {
        delete conn;
        return 0;
    }

    // Connection succeeded. Allow SIGIO to be delivered to this
    // process whenever a message is received from the OR
    nets->add(or_num, conn);
    if (FEConf->allow_sigio)
        conn->allow_sigio();
    return conn;
}

// modifies	"result"
// effects	Sends an FE_FETCHROOT request to OR "or_num" and blocks
//		until the OR_ROOT reply has been processed, which stores
//		the root oref in "result".  Returns FALSE if there is no
//		connection or the request or reply fails, TRUE otherwise.
static bool cache_prim_fetch_root (OR_num or_num, Oref *result) {  
    Network* net = cache_get_OR_conn(or_num);
    if (net == 0)
        return FALSE;

    // send message
    fe_message request;
    request.msgtype = FE_FETCHROOT;
    if (!request.encode(net) || !net->flush()) {
        perror("writing on OR connection");
        return FALSE;
    }

    // Block on this one OR until it answers with OR_ROOT.
    ubits32 expected_type = OR_ROOT;
    OR_desc wait_desc;
    wait_desc.or = or_num;
    wait_desc.msgtype = &expected_type;
    wait_desc.size = 1;

    bool aborted;
    return read_from_net(&wait_desc, 1, FE_BLOCK, &aborted, TRUE,
                         result, WAIT_FETCH);
}

// effects	Reads "bytes" bytes from "net" in chunks of at most 1024
//		bytes and throws them away.  Returns FALSE as soon as a
//		receive fails, TRUE otherwise (including when "bytes" is
//		not positive, in which case nothing is read).
static bool skip_data(Network* net, int bytes) {
    static const int scratch_size = 1024;
    static char scratch[scratch_size];

    for (int remaining = bytes; remaining > 0; ) {
        int chunk = remaining;
        if (chunk > scratch_size)
            chunk = scratch_size;
        if (!net->recv_buffer(scratch, chunk))
            return FALSE;
        remaining -= chunk;
    }

    return TRUE;
}



#if ADAPT_PREFETCH
// Statistics used for the adaptive prefetch group size scheme.

// Decayed count of the objects received in fetch replies.  The type is
// spelled out explicitly: a declaration without a type specifier
// ("implicit int") is not valid C++.
static int p_nfetched = 0;

// Decayed count of prefetched objects that were actually used.  It has
// external linkage; this file only decays and re-seeds it.
// NOTE(review): presumably incremented elsewhere when a prefetched object
// is used -- confirm.
long p_nused = 0;


// Record that a fetch reply carrying "fetched" objects has arrived.
static inline void update_fetch_stats(long fetched) {
  // Dividing the old values by 2 provides exponential forgetting
  // of the history.
  p_nfetched = fetched + p_nfetched/2;
  // The new reply contributes 1 to the used count.
  p_nused = 1 + p_nused/2;
}


// Return the prefetch group size to use for the next fetch.
//
// The first call (and every call while the size is still 0) returns the
// configured size, or MAX_PREFETCHED when the configuration value is
// negative.  Later calls move the size up or down by ADAPT_PREFETCH_INCR
// depending on the observed prefetch quality and clamp it to
// [ADAPT_PREFETCH_LOW_LIMIT, ADAPT_PREFETCH_UP_LIMIT].
static int estimate_prefetch_size(void) {
  static int psize = 0; // Adapted prefetch group size.

  if (!psize) {
    psize = (FEConf->max_prefetch >= 0) ? FEConf->max_prefetch : MAX_PREFETCHED;
    return psize;
  }

  // Without any recorded fetch (for instance when the previous fetch
  // failed before a reply arrived) there is nothing to divide by and no
  // evidence to adapt on, so the size is left where it is.
  if (p_nfetched != 0) {
    // The prefetch quality is calculated as the ratio between the number
    // of objects used and the number of objects fetched.
    float prefetch_quality = (float)p_nused/(float)p_nfetched;

    if (prefetch_quality > ADAPT_PREFETCH_THR)
      // Good quality: grow the prefetch group.
      psize = psize + ADAPT_PREFETCH_INCR;
    else
      // Otherwise shrink it.
      psize = psize - ADAPT_PREFETCH_INCR;
  }

  // Limit the output.
  if (psize < ADAPT_PREFETCH_LOW_LIMIT) psize = ADAPT_PREFETCH_LOW_LIMIT;
  if (psize > ADAPT_PREFETCH_UP_LIMIT) psize = ADAPT_PREFETCH_UP_LIMIT;

  return psize;
}
#endif
     
// Measure elapsed time in fetch (including network + cost at the OR +
// initial processing of fetched objects).  Both timers are started at the
// top of fetch_object; fetch_object itself only stops fe_fetch_time.
Timer fe_fetch_time;
Timer fe_net_fetch_time;
// Incremented each time fetch_object leaves its request loop.
int no_fetches = 0;

// From gc.cc.  fetch_object copies it into the fetch request
// (clear_pref_set) and then resets it to 0.
extern int gc_occurred; // From gc.cc

// effects	Sends a fetch request for "x" to its OR and processes incoming
//		messages until the object shows up in the cache.  "h" is
//		the prefetch hint to send; 0 selects the file-level default.
//		Returns the object, or 0 if there is no connection, the
//		request cannot be sent, or reading from the network fails.
static obj fetch_object(Xref x, prefetch_hint* h) {
    if (h == 0) h = &hint;

    // NOTE(review): the two early "return 0" paths below leave both timers
    // running, and fe_net_fetch_time is never stopped in this function --
    // confirm it is stopped elsewhere.
    fe_fetch_time.start();
    fe_net_fetch_time.start();

#if ADAPT_PREFETCH
    // This adapts the file-level default hint only; a hint supplied by the
    // caller is sent unchanged.
    hint.max_prefetch = estimate_prefetch_size();
#endif

    Network* net = cache_get_OR_conn(x.or);
    if (net == 0) return 0;

    FetchEnv fenv;
    fenv.x = x;
    fenv.was_fetched = FALSE;
    fe_message msg;
    msg.msgtype = FE_FETCH;
    msg.u.fetch.o = x.oref;
    msg.u.fetch.prefetch = *h;
    msg.u.fetch.clear_pref_set = gc_occurred;
    // Clear gc_occured flag. We have already informed the OR that another GC
    // occurred.
    gc_occurred = 0;

    obj result = 0;
    bool first_fetch = TRUE;
    // Watch this object. If it has been fetched and invalidated,
    // we need to send the fetch request again
    while (result == 0) {
	if (first_fetch || fenv.was_fetched) {
	    if (! (msg.encode(net) && net->flush())) return 0;
	    STATS(stats->fetches_sent++);
	    // NOTE(review): the reset of first_fetch/was_fetched sits inside
	    // this debug-only branch, which itself requires !first_fetch.
	    // As written first_fetch is never cleared, so the request is
	    // re-sent on every iteration and the message below is never
	    // printed.  The indentation suggests the reset was meant to
	    // follow the closing brace.  Moving it would make re-sends
	    // depend on was_fetched, which nothing in this file sets, so
	    // confirm that first.
	    if (FEConf->debug_level > 0 && !first_fetch) {
		fprintf(stderr, "Object %d:%d.%d was fetched and shrunk\n",
			x.or, OREF_SEGMENT(x.oref), OREF_INDEX(x.oref));
	    first_fetch = fenv.was_fetched = FALSE;
	    }
	}
	// Block until an OR_OBJECTS reply from this OR has been processed.
	OR_desc od;
	bool aborted;
	ubits32 msgtype = OR_OBJECTS;
	od.or = x.or;
	od.msgtype = &msgtype;
	od.size = 1;
	bool ok = read_from_net(&od, 1, FE_BLOCK, &aborted, TRUE,
				&fenv, WAIT_FETCH);
	if (ok == 0) break;
	// See if object has appeared in the cache
	result = swiz_get_object(x);
    }

    no_fetches++;
    fe_fetch_time.stop();
    return result;
}

// XXX Currently, we read the message first into a static
// buffer and then copy into the cache.  We should arrange
// things to read directly into the cache and avoid the
// extra copying.

// Receive buffer for the object descriptors of a fetch reply.  Created
// lazily by init_buffers; Mdispatch_OR_objects hands each filled buffer
// to subpage_fifo and allocates a fresh one.
static or_obj_descriptors* descs = 0;
// Allocated by init_buffers; not otherwise referenced in this file.
static OR_slot_Array*      slots = 0;

// requires	The receive arrays have not been created yet.
// effects	Creates them: room for 128 object descriptors and, on the
//		assumption of 8 slots per object, 128 * 8 slots.
static void init_buffers() {
    const int initial_objects = 128;
    const int slots_per_object = 8;	// Assume 8 slots per object?

    descs = new or_obj_descriptors(initial_objects);
    slots = new OR_slot_Array(initial_objects * slots_per_object);
}

// requires	The receive arrays have been created.
// effects	Intended to shrink them back to a useful size.  It is
//		currently a no-op, so the buffers may grow arbitrarily (XXX).
static void reset_buffers() {
}



// requires	"net" is the connection to OR "or_num" and the header of a
//		fetch reply announcing "num" objects has just been read.
// effects	Reads the "num" object descriptors, lets the cache enter
//		the fetched objects, and passes the descriptor buffer to
//		subpage_fifo (a fresh buffer is allocated for the next
//		reply).  Returns FALSE if the descriptors cannot be read,
//		TRUE otherwise.
//		NOTE(review): "env" (the caller's FetchEnv) is not
//		consulted here, so FetchEnv::was_fetched is not updated.
static bool Mdispatch_OR_objects(Network* net, OR_num or_num,
                                 int num, void* env) {
    if (descs == 0)
        init_buffers();

    // Read in the descriptors
    descs->clear();
    descs->_enlarge_by(num);
    bool got_descs = net->recv_buffer(descs->as_pointer(),
                                      sizeof(or_objdesc)*num);
    if (!got_descs) {
        reset_buffers();
        return FALSE;
    }

    // Handle the fetched objects.
    cache_enter_fetched_objects(net, num, or_num, descs->as_pointer());

    // The filled descriptor buffer goes to subpage_fifo; start a new one.
    subpage_fifo.insert(or_num, num, descs);
    descs = new or_obj_descriptors(128);

#if ADAPT_PREFETCH
    update_fetch_stats(num);
#endif

    return TRUE;
}

// Ask OR "or_num" for its statistics and wait for the answer, which is
// decoded into "stat".  Returns FALSE (after printing a diagnostic) if
// there is no connection, the request cannot be sent or the reply cannot
// be read; TRUE otherwise.
bool get_or_stats(OR_num or_num, or_stat& stat) {
    Network* net = cache_get_OR_conn(or_num);
    if (net == 0) {
        fprintf(stderr, "Failed to get the  OR connection\n");
        return FALSE;
    }

    // send message
    fe_message request;
    request.msgtype = FE_STAT;
    if (!request.encode(net) || !net->flush()) {
        perror("writing on OR connection");
        return FALSE;
    }

    // Wait for the OR_STAT reply from this one OR.
    ubits32 expected_type = OR_STAT;
    OR_desc wait_desc;
    wait_desc.or = or_num;
    wait_desc.msgtype = &expected_type;
    wait_desc.size = 1;

    bool aborted;
    if (!read_from_net(&wait_desc, 1, FE_BLOCK, &aborted, TRUE,
                       &stat, WAIT_FETCH)) {
        fprintf(stderr, "Could not get statistics\n");
        return FALSE;
    }
    return TRUE;
}

// effects	Writes the number of new objects to stderr, followed by one
//		line per object giving its xref (OR, segment, index) and
//		the uid stored at the same position in "uids".
static void print_newxu (Xrefs* xrefs, Uids* uids) {
    fprintf (stderr, "    New Xrefs and Uids: Count = %d\n", xrefs->size());

    int count = xrefs->size();
    int pos = 0;
    while (pos < count) {
        Xref x = xrefs->slot(pos);
        OR_uid u = uids->slot(pos);

        fprintf (stderr, "        Xref = %d::%d:%-d, Uid = %u\n", 
                 x.or, OREF_SEGMENT(x.oref), OREF_INDEX(x.oref), u);
        pos++;
    }
}
