// Copyright 1995 Barbara Liskov

// \section{Disk Implementation}

#include "common/basic.h"
#include "common/compat.h"

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <sys/types.h>

extern "C" {

// XXX Prevent the inclusion of <sys/buf.h> because the C++ compiler
// does not like it.
#define _SYS_BUF_H_

#include <aio.h>
}

#include "common/fe_or_msg.h"
#include "common/th_assert.h"
#include "or/or.h"
#include "or/thread.h"

#include "disk.h"
#include "mm.h"

// Expand the project's array macro for Disk_RangeList with element
// type Disk_Range.  NOTE(review): presumably this emits the
// out-of-line definitions for the array class declared in "disk.h";
// confirm against the header that defines implementArray.
implementArray(Disk_RangeList,Disk_Range)

// Open "name" for synchronous (O_SYNC) read/write access and wrap the
// resulting descriptor in a newly allocated Disk.
//
// Returns 0 if the underlying ::open call fails; otherwise the caller
// receives the new Disk.
Disk* Disk::open(char const* name) {
    Disk* disk = 0;

    int descriptor = ::open(name, O_RDWR|O_SYNC, 0);
    if (descriptor >= 0)
	disk = new Disk(descriptor);

    return disk;
}

// Wrap an already-open file descriptor in a newly allocated Disk.
// The descriptor is not validated here; the Disk destructor closes it
// when it is non-negative.
Disk* Disk::open_fd(int fd) {
    Disk* disk = new Disk(fd);
    return disk;
}

// Build a Disk on top of descriptor "fd".  The disk starts with no
// operations in flight and with every counter and timer cleared.
Disk::Disk(int fd) {
    device = fd;

    // Nothing is in flight yet, so no busy time has been accumulated.
    active = 0;
    zero_time(busy);
    zero_time(start);

    // Aggregate statistics over all operation types.
    total.count = 0;
    total.blocks = 0;
    zero_time(total.elapsed);

    // One statistics slot per operation type, each cleared.
    stats = new or_st_disk[Disk_NumTypes];
    or_st_disk* const limit = stats + Disk_NumTypes;
    for (or_st_disk* slot = stats; slot < limit; slot++) {
	slot->count = 0;
	slot->blocks = 0;
	zero_time(slot->elapsed);
    }
}

// Release the device descriptor and the per-type statistics array.
Disk::~Disk() {
    // A negative descriptor cannot refer to an open device, so there
    // is nothing to close in that case.
    if (!(device < 0)) {
	::close(device);
    }

    delete [] stats;
}

// Read the disk units described by "range" into "buffer", charging
// the operation to statistics slot "ot".  Returns the result of the
// underlying do_io call.
bool Disk::read(void* buffer, Disk_Range range, Disk_OpType ot) {
    bool const ok = do_io(Read_Op, buffer, range, ot);
    return ok;
}

// Write the contents of "buffer" to the disk units described by
// "range", charging the operation to statistics slot "ot".  Returns
// the result of the underlying do_io call.
bool Disk::write(void const* buffer, Disk_Range range, Disk_OpType ot) {
    // do_io takes a non-const pointer for both transfer directions.
    void* source = (void*) buffer;
    bool const ok = do_io(Write_Op, source, range, ot);
    return ok;
}

// Fill in the disk-related fields of "s": the current wall-clock
// time, the aggregate and per-type operation statistics, and the
// total time the disk has been busy so far.
void Disk::stat(or_stat& s) {
    // Timestamp the snapshot first; it also serves as "now" below.
    get_wallclock_time(s.disk_time);

    // Aggregate and per-operation-type counters.
    s.disk_total	= total;
    s.disk_mreads	= stats[Disk_Read];
    s.disk_mwrites	= stats[Disk_Write];
    s.disk_freads	= stats[Disk_FRead];
    s.disk_ireads	= stats[Disk_IRead];
    s.disk_creads	= stats[Disk_CRead];
    s.disk_fwrites	= stats[Disk_FWrite];
    s.disk_cwrites	= stats[Disk_CWrite];

    // Busy time accumulated over the active periods that have ended.
    s.disk_used = busy;
    if (active <= 0)
	return;

    // An active period is still in progress.  It began at "start",
    // so the busy time to report is "busy + (now - start)".
    or_st_time in_progress = s.disk_time;
    sub_time(in_progress, start);
    add_time(s.disk_used, in_progress);
}

// Perform one blocking transfer between "buffer" and the device, and
// charge it to the disk statistics.
//
//   op      Read_Op transfers from the device into "buffer"; any
//           other value transfers from "buffer" to the device.
//   buffer  Memory for the transfer, "range.count << DISK_UNIT_SHIFT"
//           bytes long.
//   range   Start address and length of the transfer, in disk units.
//   ot      Index of the per-type statistics slot to charge.
//
// Returns true only if lio_listio succeeded and aio_return reported
// exactly the requested number of bytes.  The statistics are updated
// whether or not the transfer succeeded.
//
// The mm mutex is released around the I/O and re-acquired afterwards,
// so the caller is expected to hold it on entry.
bool Disk::do_io(Op op, void* buffer, Disk_Range range, Disk_OpType ot) {
    // Use aio because the normal read/write calls use
    // an implicit file pointer that is hard to share
    // between many threads.

    // Widen before shifting so that the byte offset and length are
    // computed in off_t/size_t rather than in the (possibly narrower)
    // type of the range fields, where the shift could overflow.
    off_t  const offset = ((off_t) range.address) << DISK_UNIT_SHIFT;
    size_t const nbytes = ((size_t) range.count) << DISK_UNIT_SHIFT;

    aiocb   aio;
    aiocb*  list[1];
    bool    status;

    // Clear the whole control block first: it lives on the stack, and
    // any field left unset (notably aio_sigevent.sigev_notify) would
    // otherwise be handed to lio_listio holding arbitrary garbage.
    memset(&aio, 0, sizeof(aio));

    aio.aio_fildes	= device;
    aio.aio_offset	= offset;
    aio.aio_buf		= buffer;
    aio.aio_nbytes	= nbytes;
    aio.aio_reqprio	= AIO_PRIO_DFL;
    aio.aio_lio_opcode	= (op == Read_Op) ? LIO_READ : LIO_WRITE;
    aio.aio_sigevent.sigev_signo = 0;
#ifdef SIGEV_NONE
    // Explicitly ask for no completion notification where the
    // platform provides the POSIX constant for it.
    aio.aio_sigevent.sigev_notify = SIGEV_NONE;
#endif

    list[0] = &aio;

    // Set-up stuff for performance measurement
    or_st_time op_start;
    get_wallclock_time(op_start);
    if (active == 0) start = op_start;
    active++;

    // Release the mutex for the duration of the IO
    //
    // NOTE(review): POSIX allows lio_listio(LIO_WAIT, ...) to fail
    // with EINTR while the request is still outstanding; "aio" and
    // possibly "buffer" would then go out of scope under an in-flight
    // transfer.  Confirm that signals cannot interrupt this wait.
    or->mm->mutex->release(); {
	status = false;
	if (lio_listio(LIO_WAIT, list, 1, 0) >= 0) {
	    // aio_return yields an ssize_t; keep the full width so a
	    // large transfer count is not truncated before comparing.
	    ssize_t result = aio_return(&aio);
	    if (result >= 0 && (size_t) result == nbytes)
		status = true;
	}
    } or->mm->mutex->grab();

    // Record time at which operation ended
    or_st_time op_finish;
    get_wallclock_time(op_finish);

    // Get elapsed time for operation
    or_st_time elapsed = op_finish;
    sub_time(elapsed, op_start);

    // Update op-specific stats
    stats[ot].count++;
    stats[ot].blocks += range.count;
    add_time(stats[ot].elapsed, elapsed);

    // Update total stats
    total.count++;
    total.blocks += range.count;
    add_time(total.elapsed, elapsed);

    active--;
    if (active == 0) {
	// Active period has finished.  Update "busy" time.
	or_st_time busy_period = op_finish;
	sub_time(busy_period, start);
	add_time(busy, busy_period);
    }

    return status;
}
