// Copyright 1995 Barbara Liskov

/*
\section{Transaction Manager Implementation}

To do:
\begin{itemize}
\item Make sure MOS, NOS, and participant set always freed.
\item Periodically clean out old entries from tstatus (add a timeout).
\end{itemize}
*/

#define MILLI_SEC 1000

// Number of microseconds (currently 5 sec) by which the stable threshold
// is kept above the threshold, both on initialization and whenever the
// stable threshold is advanced.
#define  THRESHOLD_DELTA  (5000 * MILLI_SEC)

// Assumed maximum network delay: 10 msec (expressed in microseconds)
#define MAX_NETWORK_DELAY  (10*MILLI_SEC)
// Max Clock skew of 10 sec. These values will be dynamically
// changed in the implementation later XXX
#define MAX_CLOCK_SKEW  (10000*MILLI_SEC)

// Maximum number of microseconds that the incoming
// transaction is allowed to be in the future. Currently, it
// is 300 seconds
#define MAX_IN_FUTURE  (300000*MILLI_SEC)

// Minimum time between truncations of coordinator set
#define TRUNCATE_INTERVAL (10000 * MILLI_SEC) 

// Clock skew of 10 msec

#include <iostream.h>
#include "utils/global_tstamp.h"
#include "utils/Timer.h"
#include "thread.h"

#include "common/or_message.h"
#include "common/or_address.h"
#include "common/or_obj.h"
#include "common/or_set.h"
#include "common/ros.h"
#include "common/modset.h"
#include "common/transaction.h"

#include "mm/handle.h"
#include "mm/mm.h"
#include "mm/log.h"
#include "mm/logrecord.h"

#include "tm.h"
#include "cachedir.h"
#include "vqueue.h"
#include "or_config.h"
#include "or.h"
#include "fe_manager.h"
#include "or_manager.h"
#include "inv_set.h"
#include "fe_map.h"
#include "tstatus.h"
#include "or_or_msg.h"
#include "update.h"
#include "coord_set.h"

// Most recent per-phase timings, as returned by Timer::elapsed().  Within
// this file only validate_time is assigned (in TM::commit_local); the others
// are presumably maintained by other modules -- TODO confirm.
float total_time, recv_time, send_time, validate_time;
// Per-transaction phase timers.  Only validate_timer is started/stopped in
// this file.
Timer total_timer, recv_timer, send_timer, validate_timer;
// Cumulative phase timers.  TM::stat() reports their elapsed values and then
// resets all four.
Timer cumm_total_timer, cumm_recv_timer, cumm_send_timer, cumm_validate_timer;


// Constructs the transaction manager.
//
// The validation threshold starts at Tstamp(TRUE) (treated by this file as
// the current time) and the stable threshold starts THRESHOLD_DELTA above
// it.  A placeholder commit record carrying the stable threshold is appended
// to the log and flushed so that the threshold is persistent.
TM::TM() {
    Tstamp start_time(TRUE);

    // The initial threshold carries a default-constructed Address.
    threshold = Global_tstamp(Address(), start_time);
    last_trunc = start_time;

    stable_threshold = threshold + THRESHOLD_DELTA;
    to_install_stable_threshold = stable_threshold;

    vq = new Validate_queue;
    mutex_validation = new Mutex;
    coord_set = new Coord_set;
    xactions = 0;

    // Fake commit record (default-constructed Tid) whose only purpose is to
    // carry the stable threshold into the log.
    Tid placeholder_tid;
    Log_Record* threshold_record =
        new Commit_Log_Record(placeholder_tid, to_install_stable_threshold);

    Log_Index threshold_index = orx->log->append(threshold_record);
    orx->log->flush(threshold_index);
    orx->log->installed(threshold_index);
}

// Releases the validation queue, the validation mutex and the coordinator
// set that the constructor allocated.
TM::~TM() {
    delete vq;
    delete mutex_validation;
    delete coord_set;
}

// \subsection{Commit for single OR or a global read only transaction}

// Validates and, if validation succeeds, commits a transaction that involves
// only this OR (or a global read-only transaction).
//
// fe: manager of the frontend that submitted the transaction.
// tx: the transaction; on abort its NOS and MOS are freed here.
// Returns the result of validate(): OR_committed on success, otherwise the
// abort code.
Ubits32 TM::commit_local(FE_manager* fe, Transaction const* tx) {

    // Validation of this transaction, update of threshold and the
    // addition of objects to the invalid object sets of other FEs has to
    // be done atomically using a mutex. Otherwise, non-serializable
    // conditions can occur.

    mutex_validation->grab(); // released in log_commit_single or below

    Global_tstamp tx_tstamp(tx->tid);

    cumm_validate_timer.start();
    validate_timer.start();
    Ubits32 result = validate (fe, tx, tx_tstamp);
    if (result == OR_committed) {
        xactions++;

        if (tx->mos->count() == 0) {
            // Read-only transactions are automatically "installed"
            bool found = vq->mark_installed(&tx->tid);
            th_assert(found, "Transaction not found in VQ");
        }

        // Need to log if transaction is read/write, or if stable_threshold
        // changes. Invalid sets are also updated after installation
        if (update_threshold(tx_tstamp) || tx->mos->count() || tx->nos->count()) {
            // This procedure releases validation mutex
            log_commit_single(fe, tx);
        } else {
            // Read-only transaction and threshold does not need updating
            mutex_validation->release();
        }
    } else {
        // Transaction aborted.  Touch the VQ only while the validation mutex
        // is still held: every other VQ access in this file is made under
        // that mutex, so removing the entry after the release would race
        // with concurrent validations.
        vq->remove_trans(&tx->tid);
        mutex_validation->release();

        // Log abort record.
        log_abort(tx->tid);

        // Must release space for NOS/MOS (same helper as the other abort
        // paths in this file).
        free_trans_space(tx);
    }
    truncate_vqueue_if_needed();
    validate_timer.stop();  validate_time = validate_timer.elapsed();
    cumm_validate_timer.stop();
    return result;
}

// Commits a transaction issued from within the OR itself to manage its
// persistent state.
//
// The transaction is stamped with this OR's address and Tstamp(TRUE), then
// written as a data log record that is appended, flushed and installed.  No
// validation is performed and the validation mutex is not taken here.
//
// Progress traces are printed only when debug_level > 1, matching the other
// logging routines in this file, so normal operation is silent.
void TM::commit_exclusive(Transaction* transaction) {
    const bool trace = orx->config->debug_level > 1;

    if (trace) printf(">>> TM::commit_exclusive\n");
    transaction->tid = Global_tstamp(OR_address(orx->config->ornum),
                                     Tstamp(TRUE));
    if (trace) printf("---\n");
    Data_Log_Record* data_record = new Data_Log_Record(transaction);
    if (trace) printf("---\n");
    Log_Index log_index = orx->log->append(data_record);
    if (trace) printf("---\n");
    orx->log->flush(log_index);
    data_record->install();
    orx->log->installed(log_index);
    if (trace) printf("<<< TM:commit_exclusive\n");
}

// \subsection{Commit for multiple ORs}

// Prepare phase at the coordinator of a multi-OR transaction.
//
// Registers the transaction in tstatus, validates it under the validation
// mutex, logs a committing record and casts this OR's own vote.
//
// Returns OR_abort_other if tstatus reports the transaction as already
// aborted, the validation abort code if validation fails, OR_committed if
// this vote completes the commit decision, and 0 when the outcome is not yet
// known.  On either abort path the transaction's NOS/MOS are freed.
Ubits32 TM::prepare_coord(FE_manager* fe, Transaction const* tx) {
    Global_tstamp stamp(tx->tid);

    // A participant may already have aborted the transaction.
    int status = orx->tstatus->coordinator_add(tx->tid, tx->participants, fe);
    if (status == STATUS_ABORT) {
        abort(tx->tid);
        free_trans_space(tx);
        return OR_abort_other;
    }

    mutex_validation->grab();

    Ubits32 verdict = validate(fe, tx, stamp);
    if (verdict != OR_committed) {
        // Validation failed: drop the mutex, abort, and release NOS/MOS.
        mutex_validation->release();
        abort(tx->tid);
        free_trans_space(tx);
        return verdict;
    }

    // log_committing() releases the validation mutex.
    Log_Index prepare_index = log_committing(tx);
    orx->tstatus->set_log_index(tx->tid, prepare_index);

    // Cannot have nos != 0 and mos == 0
    bool read_only = (tx->mos->count() == 0);
    status = orx->tstatus->vote_ok(tx->tid, orx->config->ornum, read_only);

    // Unless all votes are in, the outcome is still open.
    if (status != STATUS_COMMIT)
        return 0;

    commit(tx->tid);
    return OR_committed;
}

// Prepare phase at a participant of a multi-OR transaction.
//
// Validates the transaction under the validation mutex.  On failure an abort
// record is logged, NOS/MOS are freed and the abort code is returned.  On
// success a prepared record is logged when the transaction modifies objects
// or when the stable threshold must advance; OR_committed is returned.
//
// force (out): TRUE if the stable threshold had to advance or if
// log_prepared() decided the record must be forced; FALSE otherwise.  It is
// left untouched when validation fails.
Ubits32 TM::prepare_part(FE_manager* fe, Transaction const* tx, bool &force) {
    Global_tstamp stamp(tx->tid);

    mutex_validation->grab();

    Ubits32 verdict = validate(fe, tx, stamp);
    if (verdict != OR_committed) {
        // Transaction aborted
        mutex_validation->release();
        log_abort(tx->tid);
        free_trans_space(tx); // Must release space for NOS/MOS
        return verdict;
    }

    // A read-only transaction is logged only if it raises the stable
    // threshold.
    force = update_threshold(stamp) ? TRUE : FALSE;

    // XXX Seem to be forcing too often--don't need to force when read/write
    // transaction increases threshold, since threshold is in prepare record
    // sent to coordinator.
    bool modifies = (tx->mos->count() != 0);
    if (!force && !modifies) {
        // Nothing to log.
        mutex_validation->release();
        return OR_committed;
    }

    // log_prepared() releases the validation mutex and may set force.
    Log_Index prepared_index = log_prepared(tx, force);

    if (modifies) {
        orx->tstatus->participant_add(tx->tid, prepared_index, fe);
    } else {
        orx->log->installed(prepared_index);
    }

    return OR_committed;
}

// Aborts transaction tid: logs an abort record, sends OR_ABORT to every other
// OR recorded for the transaction in tstatus, undoes the transaction's logged
// data record (if the log still has one), removes the tstatus entry and then
// gives the VQ a chance to be truncated.
void TM::abort(Tid const& tid) {
    log_abort(tid);

    // Abort notice for the ORs that have responded.  If we are not the
    // coordinator for the transaction, no messages will be sent.
    or_or_message abort_msg;
    abort_msg.msgtype = OR_ABORT;
    abort_msg.tid = tid;

    orx->tstatus->lock();
    {
        // The generator lives only while tstatus is locked.
        Transaction_Status::Elements voters(orx->tstatus, tid);

        while (voters.ok()) {
            if (voters.or_num() != orx->config->ornum)
                orx->or_managers->send_message(voters.or_num(), &abort_msg);
            voters.next();
        }
    }
    orx->tstatus->unlock();

    // Cancel object reservations, if they were made.
    Log_Index prepare_index = orx->tstatus->get_log_index(tid);
    // XXX Is this cast safe?  Is log's fetch operation sufficiently powerful?
    Data_Log_Record *reserved =
        (Data_Log_Record *) orx->log->fetch(prepare_index);
    if (reserved != 0) {
        reserved->abort();
        reserved->install();
        orx->log->installed(prepare_index);
    }

    // Forget the transaction.
    orx->tstatus->remove(tid);

    truncate_vqueue_if_needed();
}

// Coordinator-side commit of transaction tid.
//
// Collects the participants from tstatus, moves the transaction to phase 2,
// logs a committed record, sends OR_COMMIT to every participant other than
// this OR, and then either marks the prepare record installed (read-only at
// this OR) or starts the update phase.
void TM::commit(Tid const& tid) {
    OR_set participants;
    OR_num target;

    xactions++;

    // Commit notice for all participants (except us)
    or_or_message commit_msg;
    commit_msg.msgtype = OR_COMMIT;
    commit_msg.tid = tid;

    // Gather the participants while the votes are still known.
    orx->tstatus->lock();
    {
        Transaction_Status::Elements voters(orx->tstatus, tid);

        while (voters.ok()) {
            participants.add(voters.or_num());
            voters.next();
        }
    }
    orx->tstatus->unlock();

    participants.remove(orx->config->ornum);

    // Log index of the prepare record must be read before moving to phase 2.
    Log_Index prepare_index = orx->tstatus->get_log_index(tid);

    // Enter phase 2 in tstatus BEFORE telling the participants.
    bool read_only = orx->tstatus->committed(tid);

    log_committed(tid, &participants);

    OR_set::Elements recipients(&participants);
    while (recipients.get(target)) {
        orx->or_managers->send_message(target, &commit_msg);
    }

    // Start update phase thread if necessary
    if (!read_only) {
        start_update(prepare_index, &commit_msg);
    } else {
        orx->log->installed(prepare_index);
    }

    truncate_vqueue_if_needed();
}

// \subsection{Validation operations}

// Returns TRUE iff tstamp is not below the current threshold.  A timestamp
// exactly equal to the threshold is treated as an internal error
// (th_assert).
bool TM::threshold_check (Global_tstamp& tstamp) {
    if (!(tstamp < threshold)) {
        th_assert (tstamp > threshold, "Received tstamp equal to threshold");
        return TRUE;
    }
    return FALSE;
}

// Returns TRUE iff no object read (ROS) or modified (MOS) by tx is in the
// submitting frontend's invalid-object set.  The frontend is read-locked for
// the duration of the check, and scanning stops at the first invalid object.
bool TM::invalid_obj_check (FE_manager* fe, Transaction const* tx) {
    Ros::Elements read_iter(tx->ros);
    Modset::Elements mod_iter(tx->mos);

    Oref oref;
    OR_obj* obj;
    int num_slots;
    bool valid = TRUE;

    fe->read_lock();

    // Nothing can be invalid when the set is empty, so skip both scans.
    if (!fe->invalid_objs->empty()) {
        while (valid && read_iter.get(oref)) {
            if (fe->invalid_objs->member(oref)) {
                valid = FALSE;
            }
        }

        while (valid && mod_iter.get(obj, oref, num_slots)) {
            if (fe->invalid_objs->member(oref)) {
                valid = FALSE;
            }
        }
    }

    fe->read_unlock();
    return valid;
}

// Validates transaction tx submitted by frontend fe.
//
// Checks, in order: the transaction's timestamp is not more than
// MAX_IN_FUTURE ahead of the current time; it is above the threshold; none
// of its objects are in the frontend's invalid set; it passes the validation
// queue check; and object space can be reserved.  When every check passes
// the transaction is added to the VQ.
//
// Returns OR_committed on success, otherwise OR_abort_other,
// OR_abort_tstamp, OR_abort_invcheck, OR_abort_vqcheck or OR_abort_space.
// All callers in this file hold mutex_validation.
Ubits32 TM::validate (FE_manager* fe, Transaction const* tx,
		      Global_tstamp& tx_tstamp) {

    const bool verbose = orx->config->debug_level > 0;

    Global_tstamp cur_tstamp = Global_tstamp(fe->id(), Tstamp(TRUE));

    if (cur_tstamp + MAX_IN_FUTURE < tx->tid)
        return OR_abort_other;

    if (!threshold_check (tx_tstamp)) {
        // Failed the threshold test.
        if (verbose)
            fprintf(stderr, "Failed threshold check\n");
        return OR_abort_tstamp;
    }

    if (!invalid_obj_check(fe, tx)) {
        // Failed the invalid object test.
        if (verbose)
            fprintf(stderr, "Failed invalid object check\n");
        return OR_abort_invcheck;
    }

    if (!vq->vqueue_check (tx)) {
        // Failed the Validation Queue test.
        if (verbose)
            fprintf(stderr, "Failed VQ check\n");
        return OR_abort_vqcheck;
    }

    if (!reserve_object_space (tx)) {
        // The abort must not depend on the debug level: previously the
        // return sat inside the debug branch, so with debugging off a failed
        // reservation was reported as OR_committed without the transaction
        // ever entering the VQ.
        if (verbose)
            fprintf(stderr, "Failed to allocate space\n");
        return OR_abort_space;
    }

    // Add the transaction to the VQ
    vq->add_vqueue(tx);
    return OR_committed;
}

// Placeholder for reserving space for a transaction's objects; currently it
// always reports success.
bool TM::reserve_object_space (Transaction const*) {
    // May be we should check if there is space available on disk
    return TRUE;
}

// Records the objects written by a committed transaction as invalid at every
// other frontend that has their pages cached.
//
// fe:    frontend that made the transaction (may be NULL); it is skipped
//        because its copies are valid.
// wset:  array of the written orefs.
// wsize: number of entries in wset; nothing is done when it is 0.
//
// The caller already holds mutex_validation.  The FE map is read-locked for
// the whole scan and each frontend is write-locked while its invalid set is
// updated.  The invalid set is not logged.
void TM::add_to_invalid_sets (FE_manager *fe, const Oref* wset, int wsize) {
    FE_map* fe_map = orx->fe_map;

    if (wsize == 0) return;

    fe_map->read_lock();
    MapGenerator<Address, FE_manager*> fe_iter(*fe_map);

    // This loop could be made more efficient by first iterating through
    // objects and then finding the FEs that have cached the segments.
    // Such an attempt ran into complexity problems due to FE-specific
    // state such as created_collection.

    Address other_addr;
    FE_manager *other_fe;
    while (fe_iter.get(other_addr, other_fe)) {
        // The FE that made the transaction holds valid objects.
        bool is_committer = (fe != NULL && fe->id() == other_addr);
        if (!is_committer) {
            other_fe->write_lock();
            Invalid_set *inv_set = other_fe->invalid_objs;
            Collection_num msg_num = 0;
            bool created_collection = FALSE;

            for (int idx = 0; idx < wsize; idx++) {
                Oref oref = wset[idx];
                Uint pageid = Oref_page(oref);
                FE_state state = orx->cachedir->lookup(pageid, other_fe);

                if (state == Page_complete) {
                    // Set the page to be reparable
                    orx->cachedir->enter_single_page(pageid, other_fe,
                                                     Page_reparable);
                }

                // Pages the frontend does not have need no invalidation.
                if (state == Page_absent)
                    continue;

                // Open a new collection of invalid objects for this
                // frontend on first use.
                if (!created_collection) {
                    created_collection = TRUE;
                    msg_num = inv_set->new_collection();
                    other_fe->current_message = msg_num;
                    // Note: other_fe is locked so nobody could have
                    // changed the current message to a higher value.
                }
                inv_set->add_obj(oref, msg_num);
            }

            other_fe->write_unlock();
            if (orx->config->send_updates_at_commit)
                other_fe->send_invalidation_message_if_needed();
        }
    } // end of loop over all the FEs
    fe_map->read_unlock();
}

// Records in the cache directory that frontend fe holds the pages on which
// the transaction's newly created objects (nos) live: any such page that is
// absent at fe is entered as Page_complete.  Does nothing when fe is NULL or
// nos is empty.  Consecutive objects on the same page are looked up once.
void TM::add_to_FE_table (FE_manager *fe, Modset *nos) {
    if (fe == NULL) return;
    if (nos->count() == 0) return;

    OR_obj *obj;
    Oref oref;
    int num_slots;
    Uint prev_pageid = (Uint) -1; // Invalid pageid

    Modset::Elements new_objs(nos);
    while (new_objs.get(obj, oref, num_slots)) {
        Uint pageid = Oref_page(oref);
        if (pageid != prev_pageid) {
            prev_pageid = pageid;

            // The FE must hold allocation rights for the page.
            th_assert(orx->cachedir->alloc_rights(pageid) == fe,
                      "Objects allocated on page without allocation rights");

            FE_state state = orx->cachedir->lookup(pageid, fe);
            if (state == Page_absent) {
                // Page is absent at FE. Mark the page as complete
                orx->cachedir->enter_single_page(pageid, fe, Page_complete);
            }
        }
    }
}

// Decides whether the stable threshold must be raised to accommodate a
// transaction with timestamp tstamp, and sets to_install_stable_threshold
// accordingly.
//
// Returns TRUE when tstamp exceeds the stable threshold; the value to install
// is then max(tstamp, current time) + THRESHOLD_DELTA.  Otherwise returns
// FALSE and the value to install is the current stable threshold.  Neither
// threshold nor stable_threshold is modified here.
bool TM::update_threshold(Global_tstamp& tstamp) {
    Global_tstamp now_stamp = Global_tstamp(Address(), Tstamp(TRUE));

    if (!(tstamp > stable_threshold)) {
        to_install_stable_threshold = stable_threshold;
        return FALSE;
    }

    // Currently, setting the threshold to be tstamp + DELTA
    to_install_stable_threshold = tstamp.max(now_stamp) + THRESHOLD_DELTA;
    return TRUE;
}

// Fills s with the cumulative receive/validate/send/total times and the
// transaction count, then resets the four cumulative timers.  The
// transaction counter itself is not reset.  Runs under mutex_validation.
void TM::stat(OR_stat& s) {
    mutex_validation->grab();

    s.total_recv_time     = float_to_time(cumm_recv_timer.elapsed());
    s.total_validate_time = float_to_time(cumm_validate_timer.elapsed());
    s.total_send_time     = float_to_time(cumm_send_timer.elapsed());
    s.total_trans_time    = float_to_time(cumm_total_timer.elapsed());
    s.trans = xactions;

    cumm_total_timer.reset();
    cumm_recv_timer.reset();
    cumm_send_timer.reset();
    cumm_validate_timer.reset();

    mutex_validation->release();
}

// \subsection{Logging operations}

// Logs and installs a single-OR transaction that has passed validation.
//
// Precondition: the caller holds mutex_validation.  The mutex is released
// once the data and commit records have been appended, re-acquired to update
// the invalid sets / VQ / stable threshold, and released again before
// returning.
//
// A data record (empty when the transaction has no MOS) and a commit record
// carrying to_install_stable_threshold are appended; the log is flushed up to
// the commit record before anything is installed.
void TM::log_commit_single(FE_manager* fe, Transaction const* tx) {
    int mos_count = tx->mos->count();

    Data_Log_Record* data_rec;
    if (mos_count == 0) {
        data_rec = new Data_Log_Record();
    } else {
        data_rec = new Data_Log_Record(tx);
    }
    Log_Record* commit_rec =
        new Commit_Log_Record(tx->tid, to_install_stable_threshold);

    data_rec->commit();
    Log_Index data_index = orx->log->append(data_rec);
    Log_Index commit_index = orx->log->append(commit_rec);
    mutex_validation->release();

    orx->log->flush(commit_index);

    add_to_FE_table(fe, tx->nos); // Add the new xrefs

    data_rec->install();
    commit_rec->install();
    orx->log->installed(data_index);
    orx->log->installed(commit_index);

    // The objects have been installed, so the invalid sets can be updated
    // now.  This is conservative in the sense that an FE may have fetched
    // the new version but its version will be invalidated.  The transaction
    // is also marked installed in the VQ and stable_threshold is set to the
    // value that has been flushed.
    mutex_validation->grab();
    {
        if (mos_count > 0) {
            int wsize;
            const Oref* wset = orx->tm->vq->get_write_set(&tx->tid, wsize);
            th_assert(mos_count == wsize, "Bad size of MOS in VQ");
            add_to_invalid_sets (fe, wset, wsize);
        }
        bool found = vq->mark_installed(&tx->tid);
        th_assert(found, "Transaction not found in VQ");
        set_stable_threshold(TRUE);
    }
    mutex_validation->release();

    if (orx->config->debug_level > 1) {
        printf("Logged commit_single record\n");
    }
}

// Appends a prepared record for tx at a participant and returns its log
// index.
//
// Precondition: the caller holds mutex_validation; it is released here after
// the record is appended and the coordinator set has been truncated.
//
// force (in/out): set to TRUE when the transaction's coordinator is not yet
// in the coordinator set.  When force is TRUE on reaching the flush step, the
// log is flushed up to the prepared record and the stable threshold is
// updated.
//
// NOTE(review): coord_set->add() runs after the mutex has been released,
// whereas the membership test and truncation run under it -- confirm that
// this is safe.
Log_Index TM::log_prepared(Transaction const* tx, bool &force) {
    Log_Record* prepared_rec =
        new Prepared_Log_Record(tx, tx->coordinator,
                                to_install_stable_threshold);

    Log_Index prepared_index = orx->log->append(prepared_rec);

    bool known_coordinator = coord_set->member(tx->coordinator);
    if (!known_coordinator) {
        // Add log record for new coordinator
        // Stamos_Record* sr = new Stamos_Record(tx->coordinator, TRUE);
        // XXX What is to be done with sr??
        if (orx->config->debug_level > 1) {
            printf("Logged Stamos add record\n");
        }
        force = TRUE;
    }

    // Remove old entries from coordinator set.
    // XXX Is it safe to truncate here?  We hold validation mutex, but removals
    // are not flushed until after mutex is released.
    truncate_coordinators();
    mutex_validation->release();

    if (force) {
        orx->log->flush(prepared_index);

        // stable_threshold may now take the value that has been flushed.
        set_stable_threshold(FALSE);
    }

    // If coordinator is new, must first force Stamos log record before
    // adding to set, so that coordinator is contacted on recovery.
    coord_set->add(tx->coordinator, Tstamp(TRUE));

    if (orx->config->debug_level > 1) {
        printf("Logged prepared record\n");
    }
    return prepared_index;
}

// Appends a committing record for tx at the coordinator and returns its log
// index.
//
// Precondition: the caller holds mutex_validation; it is released here once
// the record has been appended.
//
// NOTE(review): the flush of this record is commented out, yet
// set_stable_threshold() -- documented as "called after the log has been
// flushed" -- is still invoked.  Confirm that advancing stable_threshold
// without a flush is intended.
Log_Index TM::log_committing(Transaction const* tx) {
    Log_Record* committing_rec =
        new Committing_Log_Record(tx,
                                  to_install_stable_threshold,
                                  tx->participants);

    Log_Index committing_index = orx->log->append(committing_rec);
    mutex_validation->release();
    // orx->log->flush(committing_index);

    // Bring stable_threshold up to to_install_stable_threshold.
    set_stable_threshold(FALSE);

    if (orx->config->debug_level > 1) {
        printf("Logged committing record\n");
    }
    return committing_index;
}

// Appends a committed record for tid (with its participant set), flushes the
// log up to it, then installs the record and marks it installed.
void TM::log_committed(Tid const& tid, OR_set *participants) {
    Log_Record* committed_rec = new Committed_Log_Record(tid, participants);

    Log_Index committed_index = orx->log->append(committed_rec);
    orx->log->flush(committed_index);

    committed_rec->install();
    orx->log->installed(committed_index);

    if (orx->config->debug_level > 1) {
        printf("Logged committed record\n");
    }
}

// Appends an abort record for tid, installs it and marks it installed.  The
// log is not flushed here.
void TM::log_abort(Tid const &tid) {
    Log_Record* abort_rec = new Abort_Log_Record(tid);
    Log_Index abort_index = orx->log->append(abort_rec);

    abort_rec->install();
    orx->log->installed(abort_index);

    if (orx->config->debug_level > 1) {
        printf("Logged abort record\n");
    }
}

// Appends a done record for tid, installs it and marks it installed.  The
// log is not flushed here.
void TM::log_done(Tid const &tid) {
    Log_Record* done_rec = new Done_Log_Record(tid);
    Log_Index done_index = orx->log->append(done_rec);

    done_rec->install();
    orx->log->installed(done_index);

    if (orx->config->debug_level > 1) {
        printf("Logged done record\n");
    }
}

// Appends a participant record for transaction tid describing the prepare
// record (rec, index) of OR `participant`, installs it and marks it
// installed.  The log is not flushed here.
void TM::log_participant(Tid const &tid, OR_num participant,
			 Prepared_Log_Record *rec, Log_Index index) {
    Log_Record* participant_rec =
        new Participant_Log_Record(tid, participant, rec, index);
    Log_Index participant_index = orx->log->append(participant_rec);

    participant_rec->install();
    orx->log->installed(participant_index);

    if (orx->config->debug_level > 1) {
        printf("Logged participant prepare record for OR %d\n", participant);
    }
}


// Raises stable_threshold to to_install_stable_threshold if it is currently
// lower.  This is the only procedure that assigns stable_threshold after
// construction.
//
// mutex_held: TRUE when the caller already holds mutex_validation; when
//             FALSE the mutex is grabbed and released here.
// requires:   intended to be called after the log has been flushed, so that
//             the installed value is the one on disk.
void TM::set_stable_threshold(bool mutex_held) {
    bool must_lock = !mutex_held;

    if (must_lock) {
        mutex_validation->grab();
    }

    // Another thread may already have raised stable_threshold, so only move
    // it upwards.
    if (stable_threshold < to_install_stable_threshold) {
        stable_threshold = to_install_stable_threshold;
    }

    if (must_lock) {
        mutex_validation->release();
    }
}

// Marks the transaction with timestamp ts as installed in the VQ.
//
// requires: a transaction corresponding to ts exists in the VQ (asserted).
//           Should not be called by an internal routine of TM, since it
//           grabs mutex_validation itself.
void TM::mark_installed(Global_tstamp* ts) {
    mutex_validation->grab();

    bool found = vq->mark_installed(ts);
    th_assert(found, "Transaction not found in VQ (in background)");

    mutex_validation->release();
}

// Removes "old" entries from the VQ and advances the threshold.
//
// The candidate threshold is the current time minus MAX_CLOCK_SKEW and
// MAX_NETWORK_DELAY.  It is applied, under mutex_validation, only when it
//   (a) is above the current threshold, and
//   (b) is below stable_threshold.
// Condition (b) maintains the invariant stable_threshold >= threshold.
// Condition (a) keeps the threshold monotonic: the constructor starts the
// threshold at the current time, so without it the first truncation would
// move the threshold back by the skew/delay allowance and re-admit
// timestamps that had already been excluded.
void TM::truncate_vqueue_if_needed() {
    Global_tstamp cur_tstamp = Global_tstamp(Address(), Tstamp(TRUE));
    Global_tstamp new_threshold = cur_tstamp + (-MAX_CLOCK_SKEW)
        + (-MAX_NETWORK_DELAY);

    mutex_validation->grab(); {
        if (threshold < new_threshold && new_threshold < stable_threshold) {
            vq->truncate_vqueue(&new_threshold);
            threshold = new_threshold;
        }
    }
    mutex_validation->release();
}

void TM::truncate_coordinators() {
    Tstamp now(TRUE);

    if (now < last_trunc + TRUNCATE_INTERVAL)
	return;

    OR_set or_set;
    coord_set->truncate(now + (-TRUNCATE_INTERVAL), &or_set);
    last_trunc = now;

    // Log records for removed coordinators
    OR_set::Elements elts(&or_set);
    OR_num or_num;

    while (elts.get(or_num)) {
	Stamos_Record* sr = new Stamos_Record(or_num, FALSE);
	Log_Index l1 = orx->log->append(sr);
	orx->log->installed(l1);

	if (orx->config->debug_level > 1)
	    printf("Truncating coordinator %d\n", or_num);
    }
}

// Frees the modified-object set and new-object set owned by tx.  The
// pointers inside tx are left as they were, so tx->mos and tx->nos must not
// be used afterwards.
void TM::free_trans_space(Transaction const *tx) {
    delete tx->mos;
    delete tx->nos;
}
