// Copyright 1995 Barbara Liskov

/*
\section{Transaction Manager Implementation}

To do:
\begin{itemize}
\item Make sure MOS, NOS, and participant set always freed.
\item Periodically clean out old entries from tstatus (add a timeout).
\end{itemize}
*/

#define MILLI_SEC 1000

// Amount, in microseconds, by which the stable threshold is kept above
// the threshold on initialization.
#define  THRESHOLD_DELTA  (5000 * MILLI_SEC)

// 10 msec delay
#define MAX_NETWORK_DELAY  (10*MILLI_SEC)
// Max Clock skew of 10 sec. These values will be dynamically
// changed in the implementation later XXX
#define MAX_CLOCK_SKEW  (10000*MILLI_SEC)

// Maximum number of microseconds that the incoming
// transaction is allowed to be in the future. Currently, it
// is 300 seconds
#define MAX_IN_FUTURE  (300000*MILLI_SEC)

// Minimum time between truncations of coordinator set
#define TRUNCATE_INTERVAL (10000 * MILLI_SEC) 

// Clock skew of 10 msec

#include "common/fe_or_msg.h"
#include "common/or_obj.h"
#include "common/Timer.h"
#include "common/or_set.h"
#include "common/ros.h"
#include "common/mos.h"
#include "common/nos.h"
#include "common/xrefs.h"
#include "common/uids.h"
#include "common/transaction.h"
#include "common/tstampgen.h"
#include "common/tstamp.h"
#include "common/unparser.h"
#include "mm/handle.h"
#include "mm/mm.h"
#include "mm/log.h"
#include "mm/logrecord.h"
#include "mm/rinfo.h"

#include "tm.h"
#include "vqueue.h"
#include "or_config.h"
#include "or.h"
#include "fe_manager.h"
#include "or_manager.h"
#include "fe_table.h"
#include "inv_set.h"
#include "fe_info_set.h"
#include "tstatus.h"
#include "thread.h"
#include "or_or_msg.h"
#include "update.h"
#include "coord_set.h"

extern float total_time, recv_time, send_time, validate_time;
extern Timer total_timer, recv_timer, send_timer, validate_timer;
extern Timer cumm_total_timer, cumm_recv_timer, cumm_send_timer, cumm_validate_timer;


// Build the transaction manager: seed both thresholds from the current
// time, allocate the validation structures, and log a dummy commit record
// that carries the initial stable threshold.
TM::TM() {
    tgen = new Tstamp_Generator;
    Tstamp start = tgen->generate();

    // The FE number has not been initialized yet; it is used as-is in the
    // initial threshold.
    threshold = Global_Tstamp(fe_num(), start);
    last_trunc = start;

    // The stable threshold starts THRESHOLD_DELTA ahead of the threshold.
    stable_threshold = threshold + THRESHOLD_DELTA;
    to_install_stable_threshold = stable_threshold;

    vq = new Validate_queue;
    mutex_validation = new Mutex;
    coord_set = new Coord_set;
    xactions = 0;

    // A fake commit record makes the threshold persistent.
    // XXX The Tid below is uninitialized -- is that a problem?
    Tid placeholder;
    Log_Record* threshold_rec =
        new Commit_Log_Record(placeholder, to_install_stable_threshold);
    Log_Index slot = or->log->append(threshold_rec);
    or->log->flush(slot);
    or->log->installed(slot);

    // Pre-build the map from our own OR number to offset 0 so that
    // single-OR commits do not have to rebuild it every time.
    single_or_index.store(or->config->ornum, 0);
}

// Release every object that the constructor allocated with new.
TM::~TM (){
    delete vq;
    delete mutex_validation;
    delete coord_set;
    // The timestamp generator is allocated in TM::TM() as well and was
    // previously leaked.
    delete tgen;
}

// \subsection{Commit for single OR}

// Commit a transaction that involves only this OR.
//
// Validation, the threshold update, and the additions to other FEs'
// invalid sets have to appear atomic, so the validation mutex is held
// across them; otherwise non-serializable outcomes are possible.
//
//   fe   - manager of the front end that submitted the transaction
//   tx   - the transaction; on abort its NOS and MOS are deleted here
//   x, u - receive the xrefs/uids appended for newly created objects
//
// Returns OR_COMMITTED on success, OR_STALEABORT if validation fails.
ubits32 TM::commit_single(FE_manager* fe, Transaction const* tx,
			  Xrefs* x, Uids* u) {
    mutex_validation->grab(); // released in log_commit_single or below

    ubits32 res = OR_COMMITTED;
    Global_Tstamp tx_tstamp(tx->tid);

    cumm_validate_timer.start();
    validate_timer.start();
    Rinfo* r = validate (fe, tx, tx_tstamp, x, u);
    if (r != 0) {
	xactions++;

	if (tx->mos->count() == 0) {
	    // Read-only transactions are automatically "installed"
	    bool found = vq->mark_installed(&tx->tid);
	    th_assert(found, "Transaction not found in VQ");
	}

	// Need to log if transaction is read/write, or if stable_threshold
	// changes. Invalid sets are also updated after installation
	if (update_threshold(tx_tstamp) || tx->mos->count()) {
	    // This procedure releases validation mutex
	    log_commit_single(tx, r, x, fe);
	}
	else {
	    // Read-only transaction and threshold does not need updating
	    delete r;
	    mutex_validation->release();
	}
    } else {
	// Transaction aborted.  Every other access to the validation queue
	// in this file is made with the validation mutex held, so touch the
	// queue before giving the mutex up rather than after.
	vq->remove_trans(&tx->tid);
	mutex_validation->release();

	// Log abort record.
	log_abort(tx->tid);

	// Must release space for NOS/MOS
	delete tx->nos;
	delete tx->mos;
	res = OR_STALEABORT;        // For the moment
    }
    truncate_vqueue_if_needed();
    validate_timer.stop();  validate_time = validate_timer.elapsed();
    cumm_validate_timer.stop();
    return res;
}

// \subsection{Commit for multiple ORs}

// Prepare a multi-OR transaction at the coordinator.
//
// Returns OR_STALEABORT if tstatus already reports an abort or if local
// validation fails, OR_COMMITTED if our vote completes the commit, and 0
// otherwise.  On either abort path the transaction's MOS and NOS are
// deleted here.
ubits32 TM::prepare_coord(FE_manager* fe, Transaction const* tx,
                          Xrefs* x, Uids* u) {
    Global_Tstamp tx_tstamp(tx->tid);

    // Register as coordinator; a participant may already have aborted.
    int status = or->tstatus->coordinator_add(tx->tid, tx->participants,
                                              fe->id());
    if (status == STATUS_ABORT) {
        abort(tx->tid);
        delete tx->mos;
        delete tx->nos;
        return OR_STALEABORT;
    }

    mutex_validation->grab();
    Rinfo* rinfo = validate(fe, tx, tx_tstamp, x, u);
    if (rinfo == 0) {
        // Validation failed: give up the mutex, abort, and free NOS/MOS.
        mutex_validation->release();
        abort(tx->tid);
        delete tx->nos;
        delete tx->mos;
        return OR_STALEABORT;
    }

    // log_committing releases the validation mutex.
    Log_Index index = log_committing(tx, rinfo, x);
    or->tstatus->set_log_index(tx->tid, index);

    // Cast our own vote; a transaction that is read-only at this OR votes
    // without xrefs or uids.
    bool wrote_here = (tx->mos->count() != 0);
    if (wrote_here)
        status = or->tstatus->vote_ok(tx->tid, or->config->ornum, x, u);
    else
        status = or->tstatus->vote_ok(tx->tid, or->config->ornum, 0, 0);

    // Votes are still outstanding unless tstatus says commit.
    if (status != STATUS_COMMIT)
        return 0;

    commit(tx->tid);
    return OR_COMMITTED;
}

// Prepare a multi-OR transaction at a participant.
//
//   force - set here: TRUE when the stable threshold has to be raised
//           (log_prepared may also set it)
//
// Returns OR_STALEABORT if validation fails (NOS and MOS are then deleted
// here) and OR_COMMITTED otherwise.
ubits32 TM::prepare_part(FE_manager* fe, Transaction const* tx,
                         Xrefs* x, Uids* u, bool &force) {
    Global_Tstamp tx_tstamp(tx->tid);

    mutex_validation->grab();
    Rinfo* rinfo = validate(fe, tx, tx_tstamp, x, u);
    if (rinfo == 0) {
        // Validation failed: release the mutex, log the abort, and free
        // the space held by NOS/MOS.
        mutex_validation->release();
        log_abort(tx->tid);
        delete tx->nos;
        delete tx->mos;
        return OR_STALEABORT;
    }

    // A read-only transaction is logged only if it raises the stable
    // threshold.
    force = update_threshold(tx_tstamp);

    // XXX Seem to be forcing too often--don't need to force when read/write
    // transaction increases threshold, since threshold is in prepare record
    // sent to coordinator.
    bool read_write = tx->mos->count() != 0;
    if (!force && !read_write) {
        // Nothing to log.
        mutex_validation->release();
        return OR_COMMITTED;
    }

    // log_prepared releases the validation mutex and may set force.
    Log_Index index = log_prepared(tx, rinfo, x, force);
    if (read_write) {
        or->tstatus->participant_add(tx->tid, index, fe->id());
    } else {
        or->log->installed(index);
    }
    return OR_COMMITTED;
}

// Abort transaction tid: log the abort, tell every other OR recorded for
// it in tstatus, undo its object reservations if a log record exists, and
// drop its tstatus entry.
void TM::abort(Tid const& tid) {
    log_abort(tid);

    // Abort message for the other participants that have responded.  When
    // we are not the coordinator, no messages are sent.
    or_or_message abort_msg;
    abort_msg.msgtype = OR_ABORT;
    abort_msg.tid = tid;

    or->tstatus->lock();
    {
        Transaction_Status::Elements voters(or->tstatus, tid);
        while (voters.ok()) {
            if (voters.or_num() != or->config->ornum)
                or->or_managers->send_message(voters.or_num(), &abort_msg);
            voters.next();
        }
    }
    or->tstatus->unlock();

    // Cancel object reservations, if any were made.
    Log_Index slot = or->tstatus->get_log_index(tid);
    // XXX Is this cast safe?  Is log's fetch operation sufficiently powerful?
    Data_Log_Record* data_rec = (Data_Log_Record *) or->log->fetch(slot);
    if (data_rec != 0) {
        data_rec->abort();
        data_rec->install();
        or->log->installed(slot);
    }

    // The transaction no longer needs a tstatus entry.
    or->tstatus->remove(tid);

    truncate_vqueue_if_needed();
}

// Coordinator-side commit of a multi-OR transaction: move the transaction
// to phase 2 in tstatus, log the committed record, send OR_COMMIT to the
// other participants, and then either mark the local prepare record
// installed (read-only case) or start the update phase.
//   tid - transaction being committed; its entry is looked up in or->tstatus.
// NOTE(review): xactions is incremented here without mutex_validation,
// whereas commit_single increments it and stat() reads it under that mutex.
void TM::commit(Tid const& tid) {
    OR_set participants;
    OR_num or_num;

    xactions++;

    // Send commit message to all participants (except us)
    or_or_message msg;
    msg.msgtype = OR_COMMIT;
    msg.tid = tid;

    // Find participants of transaction while we still know votes
    or->tstatus->lock(); {   
	Transaction_Status::Elements elts(or->tstatus, tid);
    
	for (; elts.ok(); elts.next())
	    participants.add(elts.or_num());
    } or->tstatus->unlock();    
    
    // We do not send a message to ourselves.
    participants.remove(or->config->ornum);
    
    // Get log index of prepare record before we move to phase 2
    Log_Index l = or->tstatus->get_log_index(tid);

    // Put transaction into phase 2 in tstatus (BEFORE telling participants),
    // and get xrefs and uids of new objects.
    // NOTE(review): ownership of the OR_Index allocated here is not visible
    // in this file -- confirm who frees it.
    msg.u.commit.index = new OR_Index;
    bool read_only = or->tstatus->committed(tid, msg.u.commit.xrefs, 
					    msg.u.commit.uids, 
					    msg.u.commit.index);

    // Log committed record
    log_committed(tid, msg.u.commit.xrefs, msg.u.commit.index, &participants);

    msg.u.commit.count = msg.u.commit.xrefs->size();

    OR_set::Elements gen(&participants);
    
    // The committed record was flushed by log_committed before any
    // participant is told.
    while (gen.get(or_num)) 
	or->or_managers->send_message(or_num, &msg);

    // Start update phase thread if necessary
    if (read_only)
	or->log->installed(l);
    else start_update(l, &msg);

    truncate_vqueue_if_needed();
}

// \subsection{Validation operations}

// Threshold test for an incoming transaction timestamp.  The test is
// currently switched off, so every transaction passes.
bool TM::threshold_check (Transaction const* tx, Global_Tstamp& tstamp) {
    // XXX Current hack: accept unconditionally.
    bool check_disabled = TRUE;
    if (check_disabled)
        return TRUE;

    // Intended test, unreachable while the hack above is in place: reject
    // timestamps below the threshold.
    if (tstamp < threshold)
        return FALSE;
    assert (tstamp > threshold); // Should not be equal
    return TRUE;
}

// Return TRUE iff no object in the transaction's ROS or MOS is a member of
// the submitting FE's invalid object set.  The FE info is read-locked for
// the duration of the scan.
bool TM::invalid_obj_check (FE_manager* fe, Transaction const* tx) {
    Ros::Elements read_gen(tx->ros);
    Mos::Elements mod_gen(tx->mos);

    Oref oref;
    OR_obj* obj;
    FE_info* info = fe->fe_info();
    bool valid = TRUE;

    info->lock->read_lock();

    // An empty invalid set cannot match anything, so skip both scans.
    if (!info->invalid_objs->empty()) {
        // Objects read by the transaction.
        while (read_gen.get(oref)) {
            if (info->invalid_objs->member(oref)) {
                valid = FALSE;
                break;
            }
        }

        // Objects modified by the transaction, unless already rejected.
        if (valid) {
            while (mod_gen.get(oref, obj)) {
                if (info->invalid_objs->member(oref)) {
                    valid = FALSE;
                    break;
                }
            }
        }
    }

    info->lock->read_unlock();
    return valid;
}

// Run the validation checks for tx and, if all pass, reserve space for its
// objects and add it to the validation queue.
//
// Returns the reservation info on success and 0 on failure.  Apart from
// the too-far-in-the-future rejection, which is silent, each failure is
// reported on stderr when the debug level is above 0.
Rinfo* TM::validate (FE_manager* fe, Transaction const* tx,
                     Global_Tstamp& tx_tstamp, Xrefs* xrefs, Uids* uids) {

    Global_Tstamp cur_tstamp = Global_Tstamp(fe_num(), tgen->generate());

    // Reject transactions stamped too far ahead of our clock.
    if (cur_tstamp + MAX_IN_FUTURE < tx->tid)
        return 0;

    // Run the three checks in order, stopping at the first failure.
    const char* failure = 0;
    if (!threshold_check (tx, tx_tstamp))
        failure = "Failed threshold check\n";
    else if (!invalid_obj_check(fe, tx))
        failure = "Failed invalid object check\n";
    else if (!vq->vqueue_check (tx))
        failure = "Failed VQ check\n";

    if (failure != 0) {
        if (or->config->debug_level > 0)
            fprintf(stderr, "%s", failure);
        return 0;
    }

    Rinfo* reserved = reserve_object_space (tx, xrefs, uids);
    if (reserved == 0) {
        if (or->config->debug_level > 0)
            fprintf(stderr, "Failed to allocate space\n");
        return 0;
    }

    // Space is reserved; the transaction joins the validation queue.
    vq->add_vqueue(tx);
    return reserved;
}

// Reserve space in the memory manager for the transaction's modified and
// new objects.  On success one Xref (with this OR's number) and one zero
// uid are appended per new object and the reservation is returned; on
// failure 0 is returned.
Rinfo* TM::reserve_object_space (Transaction const* tx,
                                 Xrefs* xrefs, Uids* uids)
{
    Rinfo* reservation = new Rinfo(tx->mos, tx->nos);

    if (or->mm->reserve(reservation)) {
        // One xref/uid pair per new object, in NOS order.
        OR_obj* obj;
        Nos::Elements new_objs(tx->nos);
        for (int i = 0; new_objs.get(obj); i++) {
            Xref x;
            x.oref = reservation->olist[i];
            x.or = or->config->ornum;
            xrefs->append(x);
            uids->append(0);
        }
        return reservation;
    }

    // Reservation failed.  Detach MOS and NOS first so that deleting the
    // Rinfo does not delete them; the callers delete them on their abort
    // paths.
    reservation->mos = NULL;
    reservation->nos = NULL;
    delete reservation;
    return 0;
}

// Add the written objects wset[0..wsize) to the invalid set of every FE
// except the one that ran the transaction, restricted to objects that the
// FE's table reports as used.
//
// The caller already holds mutex_validation.  The FE_info_set is
// read-locked for the whole walk and each FE_info is write-locked while
// it is updated.  The invalid sets are not logged.
void TM::add_to_invalid_sets (FE_manager *fe, const Oref* wset, int wsize) {
    FE_info* target;
    FE_info_set* all_fes = or->fe_info_set;

    if (wsize == 0)
        return;

    all_fes->lock->read_lock();
    FE_info_set::Elements fgen(all_fes);

    while (fgen.get(target)) {
        // The FE that made the transaction holds valid copies; skip it.
        // XXX This comparison should be done on the basis of FE numbers
        bool is_originator = (fe != NULL && target == fe->fe_info());
        if (is_originator)
            continue;

        target->lock->write_lock();
        Invalid_set* inv_set = target->invalid_objs;
        Collection_num msg_num;
        bool have_collection = FALSE;

        for (int i = 0; i < wsize; i++) {
            Oref oref = wset[i];
            if (!target->fe_table->is_object_used(oref))
                continue;

            // Open a new collection of invalid objects for this FE the
            // first time one is needed.  The FE_info is write-locked, so
            // nobody can have moved current_message to a higher value.
            if (!have_collection) {
                have_collection = TRUE;
                msg_num = inv_set->new_collection();
                target->current_message = msg_num;
            }
            inv_set->add_obj(oref, msg_num);
        }

        target->lock->write_unlock();
    }
    all_fes->lock->read_unlock();
}

// Record the orefs in xrefs in the FE table of the front end managed by
// fe.  Does nothing when either argument is NULL or the FE's info is not
// found in the FE_info_set.
void TM::add_to_FE_table (FE_manager *fe, Xrefs *xrefs) {
    if (fe == NULL || xrefs == NULL)
        return;

    FE_info_set* all_fes = or->fe_info_set;
    FE_info* candidate;

    all_fes->lock->read_lock();
    FE_info_set::Elements fgen(all_fes);

    // Locate the FE's entry, update it under its write lock, and stop.
    while (fgen.get(candidate)) {
        // XXX This comparison should be done on the basis of FE numbers
        if (candidate != fe->fe_info())
            continue;

        candidate->lock->write_lock();
        for (int i = 0; i < xrefs->size(); i++)
            candidate->fe_table->add_object(xrefs->slot(i).oref);
        candidate->lock->write_unlock();
        break;
    }

    all_fes->lock->read_unlock();
}

// Compute the stable threshold to be written with the next log record.
// Returns TRUE iff tstamp exceeds the current stable threshold, i.e. a new
// value has to be logged.  The threshold itself is not changed here; that
// happens only when the VQ is truncated.
bool TM::update_threshold(Global_Tstamp& tstamp) {
    Global_Tstamp now = Global_Tstamp(fe_num(), tgen->generate());

    if (!(tstamp > stable_threshold)) {
        // Nothing to raise: keep installing the current value.
        to_install_stable_threshold = stable_threshold;
        return FALSE;
    }

    // Currently the new value is the later of tstamp and now, plus DELTA.
    to_install_stable_threshold = tstamp.max(now) + THRESHOLD_DELTA;
    return TRUE;
}

// Copy the transaction counter into s.trans, reading it under the
// validation mutex.
void TM::stat(or_stat& s) {
    mutex_validation->grab();
    s.trans = xactions;
    mutex_validation->release();
}

// \subsection{Logging operations}

// Log and install a single-OR commit.
//
// Called with mutex_validation held (commit_single grabs it).  The mutex
// is released once both records are appended, the log is flushed and the
// records installed without it, and it is then re-grabbed to update the
// invalid sets, mark the transaction installed in the VQ, and adopt the
// flushed stable threshold.  The mutex is not held on return.
//
//   tx    - the committing transaction
//   r     - reservation info handed to the data log record
//   xrefs - xrefs of the new objects; also added to the FE's table
//   fe    - manager of the submitting FE
void TM::log_commit_single(Transaction const* tx, Rinfo* r,
			   Xrefs *xrefs, FE_manager* fe)
{
    // Sampled while the caller's mutex is still held.
    int mos_count = tx->mos->count();

    Data_Log_Record* dr = new Data_Log_Record(tx->tid, r);
    Log_Record* cr = new Commit_Log_Record(tx->tid, to_install_stable_threshold);

    // Patch up references to new objects
    dr->commit(xrefs, &single_or_index);

    Log_Index l1 = or->log->append(dr);
    Log_Index l2 = or->log->append(cr);
    mutex_validation->release(); 

    // Only the later index is flushed; dr was appended before cr.
    or->log->flush(l2);

    add_to_FE_table(fe, xrefs);

    dr->install();
    cr->install();
    or->log->installed(l1);
    or->log->installed(l2);

    // The objects have been installed. Can update the invalid
    // sets now. This is conservative in the sense that an FE
    // may have fetched the new version but its version will
    // be invalidated
    // Mark the transaction as installed in the VQ
    // Also set the stable_threshold value to the value that has been flushed
    mutex_validation->grab(); {
	if (mos_count > 0) {
	    int wsize;
	    const Oref* wset = or->tm->vq->get_write_set(&tx->tid, wsize);
	    th_assert(mos_count == wsize, "Bad size of MOS in VQ");
	    add_to_invalid_sets (fe, wset, wsize);
	}
	bool found = vq->mark_installed(&tx->tid);
	th_assert(found, "Transaction not found in VQ");
	// TRUE: we already hold mutex_validation.
	set_stable_threshold(TRUE);
    } mutex_validation->release(); 
    
    if (or->config->debug_level > 1)
	printf("Logged commit_single record\n");
}

// Log the prepared record for a participant transaction.
//
// Called with mutex_validation held; the mutex is released here once the
// records are appended.  If the coordinator is not yet in coord_set, a
// Stamos "add" record is appended as well and force is set so that it
// reaches disk before the coordinator joins the set.
//
//   tx    - the prepared transaction
//   r     - reservation info stored in the prepared record
//   xrefs - xrefs of the transaction's new objects
//   force - in: whether the caller needs the log forced;
//           out: also TRUE if a new coordinator was logged
//
// Returns the log index of the prepared record.
Log_Index TM::log_prepared(Transaction const* tx, Rinfo *r, Xrefs *xrefs,
			   bool &force) {
    Log_Record* pr = new Prepared_Log_Record(tx->tid, tx->coordinator, r,
					     to_install_stable_threshold,
					     xrefs);

    Log_Index l1 = or->log->append(pr);

    // Highest index that has to be on disk when we force.
    Log_Index last = l1;
    bool new_coordinator = !coord_set->member(tx->coordinator);
    if (new_coordinator) {
	// Add log record for new coordinator.  The record was previously
	// allocated but never appended, so it was leaked and never logged.
	Stamos_Record* sr = new Stamos_Record(tx->coordinator, TRUE);
	last = or->log->append(sr);

	if (or->config->debug_level > 1)
	    printf("Logged Stamos add record\n");
	force = TRUE;
    }

    // Remove old entries from coordinator set.
    // XXX Is it safe to truncate here?  We hold validation mutex, but removals
    // are not flushed until after mutex is released.
    truncate_coordinators();
    mutex_validation->release(); 

    if (force) {
	// Flushing the later index is assumed to cover the prepared record
	// too, as log_commit_single already relies on for its two records.
	or->log->flush(last);
	
	// Set the stable_threshold value to what has been flushed
	set_stable_threshold(FALSE);
    }

    if (new_coordinator) {
	// NOTE(review): mirrors truncate_coordinators, which marks its Stamos
	// records installed right after appending them -- confirm that this
	// is the intended lifetime for the add record.
	or->log->installed(last);
    }

    // If coordinator is new, must first force Stamos log record before 
    // adding to set, so that coordinator is contacted on recovery.
    coord_set->add(tx->coordinator, tgen->generate());

    if (or->config->debug_level > 1)
	printf("Logged prepared record\n");
    return l1;
}

// Append the coordinator's committing record for tx and return its log
// index.  Called with mutex_validation held; the mutex is released once
// the record has been appended.
Log_Index TM::log_committing(Transaction const* tx, Rinfo *r, Xrefs *xrefs) {
    Log_Record* committing =
        new Committing_Log_Record(tx->tid, r, to_install_stable_threshold,
                                  tx->participants, xrefs);
    Log_Index slot = or->log->append(committing);

    mutex_validation->release();

    // The flush of this record is currently disabled:
    // or->log->flush(slot);

    // Adopt the pending stable threshold (FALSE: the mutex is not held).
    set_stable_threshold(FALSE);

    if (or->config->debug_level > 1) {
        printf("Logged committing record\n");
    }
    return slot;
}

// Append a committed record for tid, force it to disk, and then install it
// and mark it installed in the log.
void TM::log_committed(Tid const &tid, Xrefs *xrefs, OR_Index *index,
                       OR_set *participants) {
    Log_Record* committed =
        new Committed_Log_Record(tid, xrefs, index, participants);

    Log_Index slot = or->log->append(committed);
    or->log->flush(slot);

    committed->install();
    or->log->installed(slot);

    if (or->config->debug_level > 1) {
        printf("Logged committed record\n");
    }
}

// Append an abort record for tid, install it, and mark it installed.  The
// record is not flushed here.
void TM::log_abort(Tid const &tid) {
    Log_Record* abort_rec = new Abort_Log_Record(tid);
    Log_Index slot = or->log->append(abort_rec);

    abort_rec->install();
    or->log->installed(slot);

    if (or->config->debug_level > 1) {
        printf("Logged abort record\n");
    }
}

// Append a done record for tid, install it, and mark it installed.  The
// record is not flushed here.
void TM::log_done(Tid const &tid) {
    Log_Record* done_rec = new Done_Log_Record(tid);
    Log_Index slot = or->log->append(done_rec);

    done_rec->install();
    or->log->installed(slot);

    if (or->config->debug_level > 1) {
        printf("Logged done record\n");
    }
}

// Append a record of a participant's prepare for tid, install it, and mark
// it installed.  The record is not flushed here.
void TM::log_participant(Tid const &tid, OR_num participant,
                         Prepared_Log_Record *rec, Log_Index index) {
    Log_Record* part_rec =
        new Participant_Log_Record(tid, participant, rec, index);
    Log_Index slot = or->log->append(part_rec);

    part_rec->install();
    or->log->installed(slot);

    if (or->config->debug_level > 1) {
        printf("Logged participant prepare record for OR %d\n", participant);
    }
}


// Raise stable_threshold to to_install_stable_threshold if it is still
// below that value.
//
// requires: mutex_held says whether the caller already holds
//           mutex_validation; if it does not, the mutex is grabbed and
//           released here.  Meant to be called after the log has been
//           flushed.
// effects:  Sets stable_threshold to the value that is on disk.  This is
//           the only procedure that assigns stable_threshold after
//           construction.
void TM::set_stable_threshold(bool mutex_held) {
    bool lock_here = !mutex_held;

    if (lock_here)
        mutex_validation->grab();

    // Another thread may already have bumped stable_threshold past the
    // pending value, so only ever move it forward.
    if (stable_threshold < to_install_stable_threshold)
        stable_threshold = to_install_stable_threshold;

    if (lock_here)
        mutex_validation->release();
}

// requires: A transaction corresponding to ts exists in the VQ.
//           Should not be called by an internal routine of TM, since it
//           grabs mutex_validation itself.
// effects:  Marks the corresponding transaction as installed.
void TM::mark_installed(Global_Tstamp* ts) {
    mutex_validation->grab();

    bool found = vq->mark_installed(ts);
    th_assert(found, "Transaction not found in VQ (in background)");

    mutex_validation->release();
}

void TM::truncate_vqueue_if_needed() {
    // effects: Remove "old" entries from the VQ
    
    // XXX Currently not taking the skew into account.

    Global_Tstamp cur_tstamp = Global_Tstamp(fe_num(), tgen->generate());
    Global_Tstamp new_threshold = cur_tstamp + (-MAX_CLOCK_SKEW)
	+ (-MAX_NETWORK_DELAY);
    mutex_validation->grab(); {
	if (new_threshold < stable_threshold) {
	    // To maintain the invariant:  stable_threshold >= threshold
	    int number = vq->truncate_vqueue(&new_threshold);
	    threshold = new_threshold;
	}
    }
    mutex_validation->release();
}

void TM::truncate_coordinators() {
    Tstamp now = tgen->generate();

    if (now < last_trunc + TRUNCATE_INTERVAL) 
	return;

    OR_set or_set;
    coord_set->truncate(now + (-TRUNCATE_INTERVAL), &or_set);
    last_trunc = now;

    // Log records for removed coordinators
    OR_set::Elements elts(&or_set);
    OR_num or_num;

    while (elts.get(or_num)) {
	Stamos_Record* sr = new Stamos_Record(or_num, FALSE);
	Log_Index l1 = or->log->append(sr);
	or->log->installed(l1);

	if (or->config->debug_level > 1)
	    printf("Truncating coordinator %d\n", or_num);
    }
}
