// Copyright 1995 Barbara Liskov

// \section{Disk Implementation}

#include "utils/basic.h"
#include "utils/fail.h"
#include "utils/compat.h"

#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <sys/types.h>

// Asynchronous I/O glue.  The platform switch below is commented out, so
// the AsynchIO wrapper is used unconditionally: the macros route creation,
// destruction, list submission and result retrieval through the "AIO"
// object.  The commented-out alternative maps the same macros onto the
// system <aio.h> calls.
// #ifdef __linux__
# include "AsynchIO.h"
# define INIT_AIO       AIO = new AsynchIO
# define CLEANUP_AIO    delete AIO
# define AIO_SUBMIT_LIO AIO->listio
# define AIO_GET_VALUE  AIO->value
// #else
//   extern "C" {
//    // XXX Prevents inclusion of <sys/buf.h> because g++ no like it
// #  define _SYS_BUF_H_
// #include <aio.h>
// }
// # define INIT_AIO       if (0) {}
// # define CLEANUP_AIO    if (0) {}
// # define AIO_SUBMIT_LIO lio_listio
// # define AIO_GET_VALUE  aio_return
// #endif


#include "utils/th_assert.h"
#include "or/or.h"
#include "or/thread.h"

#include "disk.h"
#include "mm.h"

// Expand the array-implementation macro for Disk_RangeList with element
// type Disk_Range (presumably paired with a matching declaration in
// disk.h -- confirm).
implementArray(Disk_RangeList,Disk_Range)

// Open the file or device called "name" for synchronous read/write access
// and wrap it in a newly allocated Disk.  Returns 0 if the open fails.
Disk* Disk::open(char const* name) {
    int const descriptor = ::open(name, O_RDWR|O_SYNC, 0);
    return (descriptor >= 0) ? new Disk(descriptor) : 0;
}

// Wrap the caller-supplied descriptor "fd" in a newly allocated Disk.
// The descriptor is not validated here.
Disk* Disk::open_fd(int fd) {
    Disk* wrapped = new Disk(fd);
    return wrapped;
}

// Build a disk object around the descriptor "fd".  Every counter and
// timer starts at zero and the asynchronous I/O machinery is set up.
Disk::Disk(int fd) {
    device = fd;

    // Nothing is in flight yet, so there is no busy time to account for.
    active = 0;
    zero_time(start);
    zero_time(busy);

    // Aggregate statistics over all operation types.
    total.blocks = 0;
    total.count = 0;
    zero_time(total.elapsed);

    INIT_AIO;

    // One statistics record per operation type.
    stats = new or_st_disk[Disk_NumTypes];
    for (int type = 0; type < Disk_NumTypes; type++) {
        or_st_disk& entry = stats[type];
        entry.count = 0;
        entry.blocks = 0;
        zero_time(entry.elapsed);
    }
}

// Release everything the constructor acquired: the descriptor (when it
// is a valid one), the asynchronous I/O state and the statistics array.
Disk::~Disk() {
    bool const have_device = (device >= 0);
    if (have_device) {
        ::close(device);
    }

    CLEANUP_AIO;
    delete [] stats;
}

bool Disk::read(void* buffer, Disk_Range range, Disk_OpType ot) {
    return do_io(Read_Op, 1, &buffer, &range, ot);
}

// Read "n" disk ranges in one request: range[i] is read into buffer[i].
// The operation is charged to statistics slot "ot".
bool Disk::readv(int n, void* buffer[], Disk_Range range[], Disk_OpType ot) {
    bool const ok = do_io(Read_Op, n, buffer, range, ot);
    return ok;
}

// Write the contents of "buffer" to the single disk range "range",
// charging the operation to statistics slot "ot".
bool Disk::write(void const* buffer, Disk_Range range, Disk_OpType ot) {
    // do_io takes non-const buffers for both directions, so the
    // constness is cast away before handing the pointer over.
    void* writable = const_cast<void*>(buffer);
    void** single_buffer = &writable;
    return do_io(Write_Op, 1, single_buffer, &range, ot);
}

// Write "n" disk ranges in one request: buffer[i] is written to range[i].
// The operation is charged to statistics slot "ot".
bool Disk::writev(int n, void const* buffer[], Disk_Range range[], Disk_OpType ot) {
    // do_io takes non-const buffers for both directions, so the
    // constness is cast away before handing the array over.
    void** writable = const_cast<void**>(buffer);
    return do_io(Write_Op, n, writable, range, ot);
}

// Fill in the disk-related fields of "s": the current wall-clock time,
// the accumulated busy time, the aggregate totals and the per-type
// counters.
void Disk::stat(OR_stat& s) {
    // Timestamp the snapshot first; it doubles as "now" below.
    get_wallclock_time(s.disk_time);

    // Aggregate and per-operation-type counters are copied verbatim.
    s.disk_total    = total;
    s.disk_mreads   = stats[Disk_Read];
    s.disk_freads   = stats[Disk_FRead];
    s.disk_ireads   = stats[Disk_IRead];
    s.disk_creads   = stats[Disk_CRead];
    s.disk_mwrites  = stats[Disk_Write];
    s.disk_fwrites  = stats[Disk_FWrite];
    s.disk_cwrites  = stats[Disk_CWrite];

    // "busy" only covers completed activity periods.  While operations
    // are outstanding, the period in progress began at "start", so the
    // reported figure is "busy + (now - start)".
    s.disk_used = busy;
    if (active > 0) {
        or_st_time in_progress = s.disk_time;
        sub_time(in_progress, start);
        add_time(s.disk_used, in_progress);
    }
}

// Perform "n" disk transfers of kind "op" as one list request and wait
// for all of them to complete.
//
// buffer[i] is the memory used by the i-th transfer.  range[i].address and
// range[i].count are shifted left by DISK_UNIT_SHIFT to form its byte
// offset and byte count.  "ot" selects the entry of "stats" that is
// charged for the operation.
//
// orx->mm->mutex is released for the duration of the I/O and re-acquired
// before returning.  NOTE(review): this implies the caller holds the
// mutex on entry -- confirm against callers.
//
// Returns TRUE iff the request was submitted successfully and every
// transfer reported exactly the requested number of bytes.  A submission
// that fails with EAGAIN is retried (without bound) after a one second
// pause; only the final attempt is timed and counted in the statistics.
bool Disk::do_io(Op op, int n, void* buffer[], Disk_Range range[], Disk_OpType ot) {
    // Use aio because the normal read/write calls use
    // an implicit file pointer that is hard to share
    // between many threads.
    //
    // The control blocks are value-initialized so that fields which are
    // not assigned below are not left indeterminate.
    aiocb*   aio = new aiocb [n]();
    aiocb**  list = new aiocb* [n];
    bool    status;

    for (int i = 0; i < n; i++) {
        aio[i].aio_fildes       = device;
        aio[i].aio_offset       = range[i].address << DISK_UNIT_SHIFT;
        aio[i].aio_buf          = buffer[i];
        aio[i].aio_nbytes       = range[i].count << DISK_UNIT_SHIFT;
        aio[i].aio_reqprio      = AIO_PRIO_DFL;
        aio[i].aio_lio_opcode   = (op == Read_Op) ? LIO_READ : LIO_WRITE;
#ifndef __linux__
        aio[i].aio_sigevent.sigev_signo = 0;
#endif

        list[i] = &aio[i];
    }

    // Set-up stuff for performance measurement
    or_st_time op_start;
    get_wallclock_time(op_start);
    if (active == 0) start = op_start;
    active++;

    // Release the mutex for the duration of the IO
    orx->mm->mutex->release(); {
        status = TRUE;
        if (AIO_SUBMIT_LIO(LIO_WAIT, list, n, 0) < 0) {
            if (errno == EAGAIN) {
                warn("Disk I/O resulted in error EAGAIN. Trying again.");
                delete [] aio;
                delete [] list;

                // "active", "start" and "busy" are otherwise only touched
                // with the mutex held, so re-acquire it before
                // withdrawing this attempt.  If that ends the activity
                // period, credit it to "busy" exactly as the normal
                // completion path does; otherwise the interval would be
                // lost when the retry overwrites "start".
                orx->mm->mutex->grab();
                active--;
                if (active == 0) {
                    or_st_time aborted_period;
                    get_wallclock_time(aborted_period);
                    sub_time(aborted_period, start);
                    add_time(busy, aborted_period);
                }
                orx->mm->mutex->release();

                // Back off without holding the mutex, then retry with
                // the mutex held, as on entry.
                sleep(1);
                orx->mm->mutex->grab();
                return do_io(op, n, buffer, range, ot);
            }
            status = FALSE;
        } else {
            // Submission succeeded: every transfer must have moved
            // exactly the number of bytes that was asked for.
            for (int i = 0; i < n; i++) {
                unsigned int result = AIO_GET_VALUE(list[i]);
                if (result != range[i].count << DISK_UNIT_SHIFT)
                    status = FALSE;
            }
        }
    } orx->mm->mutex->grab();

    // Record time at which operation ended
    or_st_time op_finish;
    get_wallclock_time(op_finish);

    // Get elapsed time for operation
    or_st_time elapsed = op_finish;
    sub_time(elapsed, op_start);

    // Update op-specific and total stats.  Failed operations are
    // counted as well.
    stats[ot].count++;
    total.count++;
    for (int i = 0; i < n; i++) {
        stats[ot].blocks += range[i].count;
        total.blocks += range[i].count;
    }
    add_time(stats[ot].elapsed, elapsed);
    add_time(total.elapsed, elapsed);

    active--;
    if (active == 0) {
        // Active period has finished.  Update "busy" time.
        or_st_time busy_period = op_finish;
        sub_time(busy_period, start);
        add_time(busy, busy_period);
    }

    delete [] aio;
    delete [] list;
    return status;
}
