// Copyright 1995 Barbara Liskov

// MM code that is specific to in-place initialization.

#include <stdio.h>
#include <iostream.h>
#include "utils/arraysort.h"
#include "utils/fail.h"
#include "utils/bits.h"
#include "utils/intarray.h"
#include "utils/intset.h"
#include "utils/th_assert.h"
#include "utils/Timer.h"

#include "common/or_obj.h"
#include "or/or.h"
#include "or/or_config.h"
#include "or/cachedir.h"
#include "or/thread.h"

#include "dformat.h"
#include "segtable.h"
#include "segment.h"
#include "mm.h"
#include "handle.h"
#include "itable.h"
#include "logrecord.h"
#include "or/gc/gc.h"
#include "or/gc/partition.h"
#include "or/gc/collector.h"

// Look up the object named by "oref" in the segment cache and wrap it
// in a newly allocated MM_Handle.
//
// Returns
//   - a new handle (in_itable == FALSE, entry == the owning segment) when
//     the segment is cached, its contents are present, and "pin" yields
//     an object;
//   - 0 when "pin" yields no object, or when "fast" is set and the
//     segment is either not cached or still being read;
//   - "retry" after starting/awaiting the segment read in the non-fast
//     case.  "retry" is declared outside this block; presumably it is a
//     sentinel asking the caller to call again -- TODO confirm.
MM_Handle* MM::read_object(Oref oref, bool fast) {
    int segnum = Oref_segment(oref);
    int index  = Oref_sindex(oref);

    // First probe the cache only.
    Segment* seg = find_segment(segnum, TRUE);

    if (seg == 0) {
        // Not cached.  A fast lookup stops here; otherwise get a cache
        // entry set up for the segment and have the lookup repeated.
        if (fast) return 0;
        find_segment(segnum, FALSE);
        return retry;
    }

    if (seg->missing()) {
        // Cached, but the contents are still being read from the disk.
        // A fast lookup stops here; otherwise wait for the read and
        // have the lookup repeated.
        if (fast) return 0;
        seg->fetch();
        return retry;
    }

    // Segment is usable: fetch the object itself.
    OR_obj* object = seg->pin(index);
    if (object == 0) return 0;

    // Found it
    MM_Handle* handle = new MM_Handle;
    handle->in_itable = FALSE;
    handle->oref_     = oref;
    handle->obj_      = object;
    handle->entry     = seg;
    return handle;
}

// Return the cache entry for segment "segnum".
//
// With "fast" set only the cache is consulted and 0 is returned on a
// miss.  Otherwise a cache entry is allocated for the segment's disk
// range and the lookup is repeated until the cache returns an entry.
// Returns 0 if the segment table has no disk range for "segnum".
Segment* MM::find_segment(int segnum, bool fast) {
    // The lookup is wrapped in a loop because parts of alloc release
    // "mm->mutex" while they wait.  Anything read before the allocation
    // may be stale afterwards, so each pass starts from scratch.
    for (;;) {
        Segment* hit = (Segment*) cache->lookup(segnum);
        if (hit != 0) return hit;

        // Cache miss: a fast lookup gives up here.
        if (fast) return 0;

        // Find the disk range occupied by the segment
        Disk_Range where;
        if (!segtable->find(segnum, where)) return 0;

        // Probe once more and allocate a cache entry only if the
        // segment is still absent.  The entry is picked up by the
        // lookup at the top of the next pass.
        if ((Segment*) cache->lookup(segnum) == 0) {
            cache->alloc(segnum,
                         where.address << DISK_UNIT_SHIFT,
                         where.count << DISK_UNIT_SHIFT);
        }
        // (A "pth_yield(NULL)" under __linux__ used to follow here; it
        // is disabled.)
    }
}

// Create the cache entry (a Segment) for segment "id".
//
// "address" and "size" are shifted right by DISK_UNIT_SHIFT to form the
// Disk_Range handed to the Segment constructor; this undoes the left
// shift find_segment applies when it calls cache->alloc.
// NOTE(review): presumably invoked by the cache during alloc -- confirm.
CacheEntry* MM::alloc_entry(int id, long address, long size) {
    Disk_Range extent;
    extent.address = address >> DISK_UNIT_SHIFT;
    extent.count   = size >> DISK_UNIT_SHIFT;

    Segment* entry = new Segment(id, extent);
    return entry;
}

// Report the free space figure kept by the segment table.
int MM::free_space() {
    int const avail = segtable->free_space();
    return avail;
}

// Install "obj" (with "num_slots" slots) at the position named by
// "oref" inside its segment.
//
// The segment is located with find_segment and must exist (asserted).
// A failed installation is fatal via th_fail.
void MM::install_object(Oref oref, OR_obj* obj, int num_slots) {
    int const segnum = Oref_segment(oref);
    Segment* segment = find_segment(segnum);
    assert(segment != 0);

    if (!segment->install(Oref_sindex(oref), obj, num_slots)) {
        // used to be a warning
        th_fail("Object installation failed");
    }
}

// Install logged modifications into their segments on disk and clear
// the corresponding records out of the log.
//
// Both paths follow the same outline:
//   1. collect_segments() chooses the log range [low, high] and gathers
//      the ids of the segments those records modify;
//   2. write_modifications() applies and writes those segments;
//   3. orx->log->applied(low, high) clears out the log records.
// When "gc" is set and "gc_on" is true, a garbage collection phase
// (partition selection, snapshot, roots, scan, sweep, outlist
// replacement) runs between steps 1 and 2, and progress is printed to
// stdout.
//
// "t" times this call; "install_time" is started and stopped alongside
// it and is printed as the running total.
void MM::clean_log() {
    Log_Index low, high;
    IntSet mod_segids; // the segments that must be written to disk anyway
    Timer t; // timing installations

    if (gc && gc_on) {
        printf("-> Begin GC phase:\n");
        t.start();
        install_time.start();
        // collect the segment indices, not the segments themselves
        collect_segments(mod_segids, low, high);
        printf("collected mod_segids\n"); //DEBUG
        // XXX sorting segments is not happening yet
        gc->collector->select_partition(Collector::weighted_inset_commonality,
                                        &mod_segids);
        printf("selected partition\n"); //DEBUG
        Uint savings =
            gc->partition_map->remove_segments(gc->collector->partition,
                                               mod_segids);
        printf("removed segments from mod_segids\n"); //DEBUG
        // "savings" is unsigned, so print it with %u; the cast keeps the
        // argument type in step with the conversion whatever Uint's
        // exact width is.
        printf("Saved %u segment reads\n", (unsigned int) savings); //DEBUG
        gc->collector->snapshot();
        printf("Snapshotted.\n"); //DEBUG
        gc->collector->enter_roots();
        printf("Entered roots\n"); //DEBUG
        gc->collector->scan();
        printf("Scanned\n"); //DEBUG
        gc->collector->sweep();
        printf("Swept\n"); //DEBUG
        gc->collector->replace_outlist();
        printf("Replaced outlist\n"); //DEBUG
        write_modifications(mod_segids);
        printf("Wrote modifications\n"); //DEBUG
        orx->log->applied(low, high);
        printf("log->applied finished\n"); //DEBUG
        t.stop();
        install_time.stop();
        printf("clean_log + GC: %f seconds, %f seconds total\n", t.elapsed(),
               install_time.elapsed());
        printf("<- Done GC phase.\n");
    } else {
        t.start();
        install_time.start();
        // We get the set of segments that should be written out, sort these
        // segments for good disk scheduling, and then write out the segments.
        collect_segments(mod_segids, low, high);
        // XXXX sorting segments isn't happening yet
        write_modifications(mod_segids);

        // Clear out the log records
        orx->log->applied(low, high);
        t.stop();
        install_time.stop();
        printf("clean_log: %f seconds, %f seconds total\n", t.elapsed(),
               install_time.elapsed());
    }
}

#undef SHOW_PURGING

// Choose the prefix of the log to clean and gather the segments it
// modifies.
//
// On return "low" is log->low(), "high" is the index of the last record
// selected (low - 1 if none was), and "segments" holds whatever
// get_modified_segments() added for the records in [low, high].
// "segments" is cleared first.
//
// Records are taken in index order while the remaining log size exceeds
// log->target_size().  The scan stops early at the first record that is
// absent, not installed, or at/after log->get_next_flush().
void MM::collect_segments(IntSet& segments, Log_Index& low, Log_Index& high){
    Log* log = orx->log;

    // When should we stop scanning the log for more segments?
    int cur_size = log->current_size();
    int target_size = log->target_size();

    low = log->low();
    high = low - 1;

#ifdef SHOW_PURGING
    Timer coll;
    fprintf(stderr, "Log size: %d Log low: %d Log high: %d\n", cur_size, low, high);
    // Measure log record absorption
    int scanned = 0;
    int purged = 0;
    coll.start();
#endif

    // Records at or beyond this index are never selected.
    // NOTE(review): presumably these are records not yet flushed --
    // confirm against Log::get_next_flush.
    Log_Index flushed = log->get_next_flush();

    segments.clear();
    while (cur_size > target_size) {
        // Can we get the segments from log[high+1]?
        unsigned int index = high+1;
        Log_Record* rec = log->fetch(index);
        if ((rec == 0) || !log->is_installed(index) || index >= flushed) {
            // Either this record has not been installed yet, or it
            // has been removed from the log, or it lies at/after the
            // flush point.
            break;
        }

        high = index;

        // The mutex is held only while the record adds its segments.
        mutex->grab();
        rec->get_modified_segments(&segments);
        mutex->release();

        cur_size -= rec->size();

#ifdef SHOW_PURGING
        scanned++;
        if (rec->absorbed()) purged++;
#endif
    }

#ifdef SHOW_PURGING
    coll.stop();
    fprintf(stderr, "Collect elapsed %f \n", coll.elapsed());
    // Report the size of the collected set; the parameter is named
    // "segments" (the former "set" no longer exists).
    fprintf(stderr, "cleaned: %3d records with %3d segments\n",
            scanned, segments.size());
    fprintf(stderr, "purged: %3d out of %3d records\n", purged, scanned);
    // "absorbed" and "mods" are declared outside this function.
    fprintf(stderr, "absorbed: %5d out of %5d mods\n", absorbed, mods);
    absorbed = 0;
    mods = 0;
#endif
}

// Sorting routine for the disk scheduler: orders segment numbers by the
// disk address of the range each segment occupies.
// Assumes that the caller holds "mm->mutex".
//
// "p1" and "p2" point at int segment numbers.  Returns a negative value,
// zero, or a positive value when the first segment's address is lower
// than, equal to, or higher than the second's.  Fails via th_fail if
// either segment has no disk range in the segment table.
static int sort_by_location(void const* p1, void const* p2) {
    int s1 = *((int const*) p1);
    int s2 = *((int const*) p2);

    Disk_Range r1, r2;
    if (! orx->mm->segtable->find(s1, r1))
        th_fail("could not find disk range for segment");
    if (! orx->mm->segtable->find(s2, r2))
        th_fail("could not find disk range for segment");

    // Compare explicitly instead of returning the difference: a
    // subtraction truncated to int can wrap (or, for an unsigned address
    // type, never go negative) and report the wrong order.
    if (r1.address < r2.address) return -1;
    if (r1.address > r2.address) return 1;
    return 0;
}

// Sort the segment numbers in "segments" with sort_by_location.
void MM::sort_segments(IntArray& segments) {
    // sort_by_location calls segtable->find on every comparison, so the
    // mutex is held for the duration of the sort.
    mutex->grab();

    // TODO: Rotational scheduling will probably do better
    ArraySort(segments, sort_by_location);

    mutex->release();
}

// Locate segment "segnum", wait until its contents are present, apply
// pending itable modifications to it (without removing them from the
// itable) and pin it.
//
// segnum      segment to fetch
// fast        passed through to find_segment; when set, the
//             "fetches"/"misses" counters are left untouched
// grab_mutex  when TRUE this function grabs "mutex" on entry and
//             releases it before returning; when FALSE it never touches
//             the mutex (NOTE(review): presumably the caller already
//             holds it -- confirm)
//
// Returns the pinned segment, or 0 if find_segment found none.
Segment* MM::fetch_segment(int segnum, bool fast, bool grab_mutex=TRUE) {
    bool missed = false;
    if (grab_mutex) {
        mutex->grab();
    }

    Segment* seg = find_segment(segnum, fast);
    if (seg == 0)  {
        // Release only what was grabbed above.  Releasing here
        // unconditionally would drop a mutex this call never acquired
        // when grab_mutex is FALSE.
        if (grab_mutex) {
            mutex->release();
        }
        return 0; // segment not found
    }

    // Wait for the contents; remember whether any wait was needed.
    while (seg->missing()) {
        missed = true;
        seg->fetch();
    }

    modify_segment(seg, FALSE); // do not remove mods from itable
    seg->pin_segment(); // pin after modifying to avoid self-deadlock

    if (grab_mutex) {
        mutex->release();
    }

    if (!fast) {
        fetches++;
        if (missed) misses++;
    }
    return seg;
}


static int itable_mod_cmp(void const *v1, void const *v2) {
    Itable_Mod** mod1 = (Itable_Mod **) v1;
    Itable_Mod** mod2 = (Itable_Mod **) v2;
    return Oref_cmp((*mod1)->oref(), (*mod2)->oref());
}


// Apply the itable's pending modifications for "seg" to the segment.
//
// Each pending modification is installed with install_object unless the
// segment is already marked up to date.  With "remove_mods" set, the
// modifications are first sorted by oref, each one is also removed from
// the itable, and -- if there were any -- the segment is marked dirty
// and its cachedir state is altered from Page_reparable to
// Page_unreparable.  Every modification obtained is unref'ed.
//
// Afterwards the segment is marked up to date and "installs" is advanced
// by the number of modifications processed.
void MM::modify_segment(Segment *seg, bool remove_mods) {

    // Nothing to do when the segment is current and the itable entries
    // are to be kept.
    if (!remove_mods && seg->uptodate) return;

    int id = seg->id();
    Itable_Mods pending;
    itable->get_modifications(id, &pending);

    if (remove_mods) {
        qsort(pending.as_pointer(), pending.size(), sizeof(Itable_Mod*),
              itable_mod_cmp);
    }

    // apply the pending modifications and remove entries from itable
    int count = pending.size();
    for (int k = 0; k < count; k++) {
        Itable_Mod* mod = pending[k];

        if (! seg->uptodate) {
            Oref where = mod->oref();
            OR_obj* image = mod->object();
            install_object(where, image, mod->num_slots());
        }

        if (remove_mods) {
            itable->remove(mod);
        }
        mod->unref();
    }

    seg->uptodate = TRUE;
    installs += count;

    if (remove_mods && count > 0)  {
        // Some modified objects were removed from itable.
        orx->cachedir->alter(id, Page_reparable, Page_unreparable);
        seg->mark_dirty();
    }
}

// For every segment id in "segments": bring the segment in, apply its
// pending modifications (removing them from the itable) and write it to
// disk.  The mutex is grabbed before the loop and released after it.
//
// Every id must name a segment known to the segment table; this is
// asserted, as in install_object.
void MM::write_modifications(IntSet const& segments) {
    // We just loop over the segments: read a segment, modify it,
    // and then write it out.
    mutex->grab();
    IntSet::Elements g = &segments;
    int segnum;
    while (g.get(segnum)) {
        Segment* seg = find_segment(segnum, FALSE);
        // find_segment returns 0 when the segment table has no disk
        // range for segnum; catch that here instead of dereferencing a
        // null pointer below.
        assert(seg != 0);

        seg->fetch(true);
        // XXXseg->coop_fetch(); // try to fetch from FEs if needed

        // apply mods
        modify_segment(seg, TRUE); // remove mods from itable

        // write to disk
        seg->write();
    }
    mutex->release();
}

// Resizing is not supported here: the argument is ignored and a
// message is written to stderr.
void MM::resize_dspace(int) {
    fputs("cannot change disk utilization for in-place policy\n", stderr);
}

// Initialization and recovery code

// Forward declaration of the helper used by create_seg_table; the
// definition further down in this file leaves the first parameter
// unnamed and unused.
static Disk_Segment* make_seg(int id, Disk_Range, int num);
// requires	Segment occupying range can store object with "num" total slots
// effects	Create a disk segment with a single object big enough to
//		have "num" total slots ("num" includes header).
//		The result is allocated by make_seg and handed to the caller.

// Build the initial segment table on disk and then load it.
//
// The table has two levels: one root segment, placed directly after
// DISK_SB_2, holding the disk range of every leaf segment; and
// "num_leaves" leaf segments, laid out back to back after the root,
// each describing "segs_per_leaf" segment numbers whose ranges start
// out as zero (unmapped).  The root's range is recorded in
// super->segtable.  Every segment is written with disk->write under the
// mutex; a failed write calls sysfail.  Finally recover_seg_table()
// reads the table back in.
void MM::create_seg_table() {
    // Find maximum number of segments we may need to map
    // XXX We assume that all segments are at least "segprefsize/2" big
    int max_segs = 2 * (super->size / super->segprefsize);

    // Find number of entries per leaf segment.  Leave slightly more than a
    // kilobyte in each leaf segment to account for various overheads.
    int segsize = super->segprefsize << DISK_UNIT_SHIFT;

    int segs_per_leaf = (segsize - 1100) / sizeof(Disk_Range);
    // Number of leaves needed to cover max_segs entries (rounded up)
    int num_leaves = (max_segs + segs_per_leaf - 1) / segs_per_leaf;
    // Every leaf segment occupies one preferred-size segment
    int blks_per_leaf = super->segprefsize;
    // Slots for a leaf object: Disk_LeafObj plus (segs_per_leaf - 1)
    // further Disk_Range entries
    int lslots = sizeof(Disk_LeafObj)/sizeof(OR_slot) 
      + (segs_per_leaf - 1)*sizeof(Disk_Range)/sizeof(OR_slot);

    // Assign disk range for root segment.
    // (Allow kilobyte for overhead again.)
    // The shift discards any remainder; the result is then raised to at
    // least segprefsize.
    Uint root_seg_size= (sizeof(Disk_Range)*num_leaves + 1024) 
      >> DISK_UNIT_SHIFT;
    if (root_seg_size < super->segprefsize)
	root_seg_size = super->segprefsize;

    Disk_Range root_range;
    root_range.address  = DISK_SB_2+1;
    root_range.count    = root_seg_size;
    super->segtable	= root_range;

    // Now allocate root segment
    // NOTE(review): the root stores "num_leaves" ranges (see the loop
    // below), yet this slot count is derived from "segs_per_leaf" like
    // the leaf's -- confirm whether that is intended.
    int rslots =  sizeof(Disk_RootObj)/sizeof(OR_slot) 
		  + (segs_per_leaf - 1)*sizeof(Disk_Range)/sizeof(OR_slot);
    Disk_Segment* r_seg = make_seg(0, root_range, rslots);
    Disk_RootObj* r = (Disk_RootObj*) r_seg->pages[0].lookup(0);
    th_assert(r, "Root object not allocated on the page");

    r->max_segments	= max_segs;
    r->num_leaves	= num_leaves;
    r->segs_per_leaf	= segs_per_leaf;

    // Now initialize the leaf entries in the root segment

    // Next available disk address
    Disk_Address avail = root_range.address + root_range.count;

    // Next segment number to be mapped (skip root and leaves)
    int segnum = num_leaves + 1;

    for (int i = 0; i < num_leaves; i++) {
	// Leaf i takes the next blks_per_leaf units after the previous one
	Disk_Range leaf_range;
	leaf_range.address = avail;
	leaf_range.count   = blks_per_leaf;

	// Record the leaf's location in the root
	r->ranges[i].address = leaf_range.address;
	r->ranges[i].count = leaf_range.count;

	// Leaf segments are numbered 1..num_leaves (the root is 0)
	Disk_Segment* l_seg = make_seg(i+1, leaf_range, lslots);
	Disk_LeafObj* l = (Disk_LeafObj*) l_seg->pages[0].lookup(0);
	th_assert(l, "Leaf object not allocated on the page");

	// This leaf describes segment numbers [segnum, segnum+segs_per_leaf)
	l->first = segnum;
	l->count = segs_per_leaf;
	for (int j = 0; j < segs_per_leaf; j++) {
	    // Mark unmapped entries with zero ranges
	    l->ranges[j].address = 0;
	    l->ranges[j].count = 0;
	}

	mutex->grab(); {
	    if (! disk->write(l_seg, leaf_range)) sysfail(orx->config->disk);
	} mutex->release();

	// NOTE(review): make_seg obtains this storage with "new Page[...]"
	// and casts it to Disk_Segment*; releasing it through a
	// Disk_Segment* array delete relies on the two types being
	// interchangeable here -- confirm.
	delete [] l_seg;

	avail += blks_per_leaf;
	segnum += segs_per_leaf;
    }

    // The root is written last, after all its leaf entries are filled in
    mutex->grab(); {
	if (! disk->write(r_seg, root_range)) sysfail(orx->config->disk);
    } mutex->release();

    // NOTE(review): same allocation/deallocation type question as for
    // the leaf segments above.
    delete [] r_seg;

    // Also need to read in the segment table content
    recover_seg_table();
}

// Allocate the in-memory image of a segment covering "range" and place
// a single object on its first page.
//
// The first parameter (a segment id) is unnamed and unused.
// "num_slots" is used only for the capacity check below; the object
// actually allocated takes all the slots available on the first page.
// Fails via th_fail if the object cannot fit or cannot be allocated.
// The image is created with "new Page[...]" and returned as a
// Disk_Segment*.
static Disk_Segment* make_seg(int, Disk_Range range, int num_slots) {
    // Number of whole pages covered by the range
    int page_count = (range.count << DISK_UNIT_SHIFT)/Page_size;

    // Slots usable on a page holding exactly one object: the bytes of an
    // empty page less the per-object overhead.
    int free_bytes = Page::empty_page_bytes();
    int first_page_slots = (free_bytes - Overhead_bytes)/Slot_size;

    // Ensure that object can fit on the segment in n pages
    if ((Uint)num_slots > (page_count - 1) * Page_size/Slot_size + first_page_slots) {
        th_fail("Object too big while creating segment object");
    }

    Page* storage = new Page[page_count];
    Disk_Segment* image = (Disk_Segment*) storage;

    // (Segment header fields -- magic, id, type, num_pages -- used to be
    // filled in here; that code is disabled.)
    image->pages[0].init();

    // Allocate object on the page and then overwrite it with a bigger obj
    Slot* placed = image->pages[0].allocate(0, first_page_slots - OR_obj_headers);
    if (placed == 0) {
        th_fail("Object too large for page");
    }
    return image;
}

void MM::recover_seg_table() {
    segtable = new Segment_Table;
    segtable->recover();
}
