#include <list>
#include <queue>
#include <iostream>
#include <arc/mdsdiscovery.h>
#include <arc/mdsquery.h>
#include <arc/standardbrokers.h>
#include <arc/jobsubmission.h>
#include <arc/jobftpcontrol.h>
#include <arc/joblist.h>
#include <arc/url.h>
#include <arc/target.h>
#include <arc/notify.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include "gridjm.h"

/* This software is released under GPL. It borrows heavily from ngjm, a
 * job manager which has the licence description given below.
 * Antti Hyvärinen
 */

/* ngjm.cpp - see ngjm.h for further information */
/* Copyright 2003 Henrik Thostrup Jensen and Jesper Ryge Leth
 * All rights reserved.
 *
 * This file is part of NGProxy.
 *
 * NGProxy is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * NGProxy is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with NGProxy; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
                                                                                          
/**
 * NGProxy will change its name to "Nunabungo Gravdeha Jelahero Mongyput" /
 * NG Job Manager, or ngjm, but for now it is still called NGProxy
 */


using namespace std;

int main(int argc, char **argv) {
    if (argc >= 2)
        setenv("MAXGRID", argv[1], true);

    // Output pid to the pid file
    pid_t pid = getpid();
    char *pidstr = (char *)malloc(6);
    sprintf(pidstr, "%i\n", pid);
    int fd = open("/tmp/gridjm.pid", O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR);
    if (fd < 0) {
        cout << "Error opening pid file" << endl;
        return 1;
    }
    int i = write(fd, pidstr, strlen(pidstr));
    close(fd);
    free(pidstr);

    GridJM jm = GridJM();
    if (argc == 3)
        jm.uds = 0;

    int result = jm.start();
}

void* uds_listener(void* pointer) {

    // Do stupid typecast (thread functions must use void pointers)
    GridJM* jm = (GridJM*) pointer;
    // Start listening
    int retval = 0;
    if (jm->uds) {
        notify(INFO) << "Using uds for communication" << endl;
        retval = jm->Listen();
    } else {
        notify(INFO) << "Using net for communication" << endl;
        retval = jm->NetListen();
    }
    // Exit thread (we only get here on failure)
    pthread_exit((void *) retval);
}

/**
 * Configures the manager and runs the main loop.
 *
 * The maximum number of simultaneous grid jobs is taken from the MAXGRID
 * environment variable (parsed with atoi) and defaults to 30 when the
 * variable is not set.  The status update interval is set to 120, which the
 * main loop compares against differences of time() values, i.e. seconds.
 *
 * @return 0 once mainloop() has returned.
 */
int GridJM::start() {
    SetNotifyLevel(INFO);

    const char *maxgrid = getenv("MAXGRID");
    if (maxgrid)
        this->maxgridjobs = atoi(maxgrid);
    else
        this->maxgridjobs = 30;

    this->update_interval = 120;

    mainloop();

    // The function is declared to return int; flowing off the end without a
    // return statement is undefined behaviour.
    return 0;
}

/**
 * Main loop of the job manager.
 *
 * Starts the listener thread (uds_listener) and then, about once per second
 * until `running` becomes false:
 *   - refreshes the job states when more than update_interval seconds have
 *     passed since the last refresh,
 *   - submits one queued job or asks the client for more (HandleNewJobs),
 *   - reaps finished download children recorded in forked_downloads.
 *
 * Exits the process if the listener thread cannot be created.
 */
void GridJM::mainloop() {

    // Everything the listener thread touches must be ready before the thread
    // exists: it locks new_jobs_lock as soon as a client sends a job.
    pthread_mutex_init(&new_jobs_lock, NULL);
    pthread_mutex_init(&current_jobs_lock, NULL);
    this->running = true;

    // Standard posix thread init
    pthread_attr_t thread_attr;
    pthread_attr_init(&thread_attr);
    pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);

    // Create the listener thread, actually handing over the attributes
    // prepared above.
    pthread_t listener_thread;
    int retval = pthread_create(&listener_thread, &thread_attr,
                                uds_listener, this);
    pthread_attr_destroy(&thread_attr);
    if (retval != 0) {
        notify(ERROR) << "Error creating UDS Listener thread, exiting" << endl;
        exit(1);
    }

    time_t last_time = time(0);

    do {

        // Check if it is time to update job status
        if (time(0) - last_time > update_interval) {
            UpdateJobStatus();
            last_time = time(0);
        }

        HandleNewJobs();

        // sleep() returns the unslept remainder when a signal cut it short
        if (sleep(1)) {
            notify(INFO) << "The sleeper awakens" << endl; // We should do clean up here
        }

        // Reap download children that have exited; WNOHANG keeps the loop
        // from blocking on the ones still running.
        list<pid_t>::iterator cpid = forked_downloads.begin();
        while (cpid != forked_downloads.end()) {
            if (waitpid(*cpid, NULL, WNOHANG) > 0)
                cpid = forked_downloads.erase(cpid);
            else
                ++cpid;
        }

    } while (running);
}

void GridJM::HandleNewJobs() {
    /**
     * We now check for the arrival of new jobs
     * This is done by acquiring lock, see if there is anything
     * in the new_jobs vector, we take one out, release the lock.
     * and then we submit the job.
         * If there are no new jobs and we get here, it means the queue
         * for sending jobs is empty, so we might want to ask more jobs.
         * Call the procedure...
     */
    bool new_job = false;    // Indicates whethere there are any new jobs
    Xrsl* xrsl = 0;
    pthread_mutex_lock(&new_jobs_lock);

    if (!new_jobs.empty()) {  // See if any new jobs arrived
        notify(INFO) << "New job(s) appeared in queue" << endl;
        // Take the first element in the queue and pop it
        xrsl = new_jobs.front();
        new_jobs.pop();
        new_job = true;
    }
    pthread_mutex_unlock(&new_jobs_lock); // Release lock

    if (new_job) {
        URL *jobid = SubmitJob(xrsl);
        notify(DEBUG) << jobid->str() << endl;
        if (!jobid) {
            // Submission failed - do nothing
            // Hey? Are we leaking here sometimes?
            notify(DEBUG) << "Job submission failed - mem leak" << endl;
            return;
        }
        else {
            // Job was submitted succesfully
            // Create jobstatus object and push onto current_jobs
            JobStatus* js = new JobStatus(xrsl, jobid);
            current_jobs[jobid->str()] = js;
            // We need a counter telling how many jobs are
            // in current jobs
            pthread_mutex_lock(&current_jobs_lock);
            num_current_jobs += 1;
            pthread_mutex_unlock(&current_jobs_lock);
        }
    } else {
        GetMoreJobs();
    }

    return;
}


/**
 * Asks the connected client for one more job if the grid has room.
 *
 * Does nothing while xrslsocket is -1.  Otherwise, with current_jobs_lock
 * held, the current job count is logged and, if it is below maxgridjobs,
 * the line "# getjob 1" is written to the client socket.
 */
void GridJM::GetMoreJobs() {
    if (xrslsocket == -1)
        return;

    pthread_mutex_lock(&current_jobs_lock);

    notify(INFO) << "Number of jobs in grid: " << num_current_jobs << endl;

    const bool has_room = num_current_jobs < this->maxgridjobs;
    if (!has_room) {
        notify(INFO) << "Grid is full" << endl;
    } else {
        notify(DEBUG) << "Asking for more jobs from the socket" << endl;
        write(xrslsocket, "# getjob 1\n", 11);
    }

    pthread_mutex_unlock(&current_jobs_lock);
}

/**
 * Accepts client connections on the unix domain socket named by socketdir.
 *
 * Any existing file at that path is removed before binding.  Connections
 * are served one at a time: xrslsocket is pointed at the accepted socket
 * and HandleConnection() runs until that connection is finished.
 *
 * @return -1 if the socket cannot be set up.  The accept loop itself never
 *         terminates, so there is no successful return.
 */
int GridJM::Listen() {

    // Create socket address struct used for binding.  The path is copied
    // into a fixed-size array, so refuse anything that does not fit
    // (including the terminating NUL).
    struct sockaddr_un server;
    memset(&server, 0, sizeof(server));
    if (socketdir.size() >= sizeof(server.sun_path)) {
        notify(ERROR) << "Error binding socket to file " + socketdir << endl;
        return -1;
    }
    server.sun_family = AF_UNIX;
    strcpy(server.sun_path, socketdir.c_str());

    // Create socket
    int server_socket = socket(AF_UNIX, SOCK_STREAM, 0);
    if (server_socket == -1) {
        notify(ERROR) << "Error creating socket" << endl;
        return -1;
    }

    int on = 1;
    setsockopt(server_socket, SOL_SOCKET, SO_REUSEADDR, (void*)(&on), sizeof(on));

    // Delete any previous file
    unlink(socketdir.c_str());

    // Bind socket to file
    if (bind(server_socket, (struct sockaddr*) &server, sizeof(server)) == -1) {
        notify(ERROR) << "Error binding socket to file " + socketdir << endl;
        close(server_socket);
        return -1;
    }

    // Start listening
    if (listen(server_socket, 2) == -1) {
        notify(ERROR) << "Listen failed" << endl;
        close(server_socket);
        return -1;
    }
    notify(INFO) << "Listener thread started" << endl;

    // Start accepting connections
    for (;;) {
        // accept() overwrites the length argument, so it has to be reset
        // for every call.
        struct sockaddr_un client;
        socklen_t addrlen = sizeof(client);

        // Wait for connection
        int socket_handle = accept(server_socket, (struct sockaddr*) &client,
                                   &addrlen);
        if (socket_handle == -1) {
            notify(ERROR) << "Error accepting connetion" << endl;
        }
        else {
            notify(INFO) << "Accepted Connection" << endl;
            // If we want to respond faster to multiple connections we
            // could create a thread here
            xrslsocket = socket_handle;
            HandleConnection(socket_handle);

            // The connection is over.  Stop GetMoreJobs() from writing to
            // it before the descriptor is released.
            xrslsocket = -1;
            close(socket_handle);
        }
        usleep(50000); // .05 seconds (creates a max of 20 connections pr sec)
    }
    return 0;
}

int GridJM::NetListen() {

    // Create socket
    int server_socket = socket(PF_INET, SOCK_STREAM,0);
    if (server_socket == -1) {
        notify(ERROR) << "Error creating socket" << endl;
        return -1;
    }

    int on = 1;
    setsockopt(server_socket,SOL_SOCKET,SO_REUSEADDR,(void*)(&on),sizeof(on));

    // Create socket address struct used for binding
    struct sockaddr_in server;
    struct in_addr sin_addr;
    inet_aton("0.0.0.0", &sin_addr);
//    inet_aton(INADDR_LOOPBACK, &sin_addr);

    server.sin_family = AF_INET;
    server.sin_port = htons(12345);
    server.sin_addr = sin_addr;

    int retval; // used for return values

    // Bind socket to file
    retval = bind(server_socket, (struct sockaddr*) &server, sizeof(server));
    if (retval == -1) {
        notify(ERROR) << "Error binding socket to file " + socketdir << endl;
        return -1;
    }

    // Start listening
    retval = listen(server_socket,2);
    if (retval == -1) {
        notify(ERROR) << "Listen failed" << endl;
        return -1;
    }
    notify(INFO) << "Listener thread started" << endl;

    struct sockaddr_un client;
    int addrlen = sizeof(client);

    // Start accepting connections
    for (;;) {
        // Wait for connection
        int socket_handle = accept(server_socket, (struct sockaddr*)&client,
                                   (socklen_t*) &addrlen);
        if (socket_handle == -1) {
            notify(ERROR) << "Error accepting connetion" << endl;
        }
        else {
            notify(INFO) << "Accepted Connection" << endl;
            // If we want to respond faster to multiple connections we 
            // could create a thread here
            xrslsocket = socket_handle;
            HandleConnection(socket_handle);
        }
        usleep(50000); // .05 seconds (creates a max of 20 connections pr sec)
    }
    return 0;
}
void GridJM::HandleConnection(int socket_handle) {

    const int MAX_BUFFER = 1024;
    char buffer[MAX_BUFFER];
    // EOF needs to be cleared so we need a sockstream because of
    // the interface of clearerr.
    FILE *sockstream = fdopen(socket_handle, "rw");

    bool ok = 1;

    while (ok) {
        int bytes_read = 0;
        int read_still = 1;
        string recieved_message = "";
        /* Read everything from the socket, then create an
         * xrsl object from the string recived and attempt
         * to submit it.
         *
         * You could do nasty stuff if you just kept sending stuff
         * to it, but we dont care about that right now
         */
        do {
            bytes_read = read(socket_handle,buffer,MAX_BUFFER);
            switch (bytes_read) {
                case -1: // Error
                    notify(ERROR) << "Error during read from socket" << endl;
                    ok = 0;
                    break;
                case 0: // Read EOF (done reading)
                    notify(INFO) << "Read EOF" << endl;
                    clearerr(sockstream);
                    read_still = 0;
                    ok = 0;
                    goto lost_connection;
                    break;
                default: { // Reading
                    string partial_message (buffer, bytes_read);
                    if (partial_message[partial_message.size()-1] == EOF) {
                        notify(INFO) << "Received EOF" << endl;
                        read_still = 0;
                        clearerr(sockstream); // Get on with reading
                        partial_message[partial_message.size()-1] = '\0';
                    }

                    recieved_message += partial_message;
                    // FIXME we need size check here
                }
            }

        } while (read_still);

        // Ok now were done reading
        // Create xrsl object
        Xrsl* xrsl = NULL;
        try {
            xrsl = new Xrsl(recieved_message);
        }
        catch (XrslError e) {
            notify(WARNING) << "Recieved broken xrsl, will not submit";
            return;
        }

        // Grap xrsl vector lock, push the Xrsl object into
        // it, and realease lock
        pthread_mutex_lock(&new_jobs_lock);
        new_jobs.push(xrsl);
        pthread_mutex_unlock(&new_jobs_lock);
        recieved_message = "";

        notify(INFO) << "New job was put into queue" << endl;
    }
    return;

lost_connection:
    notify(ERROR) << "Connection was lost" << endl;
    return;
}

/**
 * Submits one job description to the grid.
 *
 * The cluster list is refreshed when it is empty or older than clupdp
 * seconds.  The description is validated, targets are built from the queue
 * information of the known clusters and brokered, and the job is submitted.
 * The resulting id is registered with AddJobID() under the description's
 * "jobname" relation ("unknown" when there is none).
 *
 * @param xrsl  job description; not modified and not taken over.
 * @return a heap-allocated URL owned by the caller.  On success it holds
 *         the new job id.  On any handled error it is a URL constructed
 *         from the empty string, never NULL.
 */
URL *GridJM::SubmitJob(Xrsl *xrsl) {

    // Update the available clusters if list is old
    if ((clusters.size() == 0) || (time(0) - clupd > clupdp)) {
        try {
            clusters = GetClusterResources();
            clupd = time(0);
        } catch (const MDSDiscoveryError& e) {
            notify(WARNING) << "Error: " << e.what() << endl;
            return new URL("");
        }
    }

    try {
        PerformXrslValidation(*xrsl);
    } catch (const ARCLibError& e) {
        notify(INFO) << e.what() << endl;
        return new URL("");
    }

    std::list<Queue> queuelist = GetQueueInfo(clusters, MDS_FILTER_CLUSTERINFO,
                true, "", 20);

    std::list<Target> targetlist;
    try {
        targetlist = ConstructTargets(queuelist, *xrsl);
    } catch (const TargetError& e) {
        notify(ERROR) << e.what() << endl;
        return new URL("");
    }

    PerformStandardBrokering(targetlist);

    JobSubmission submit(*xrsl, targetlist, false);

    string JobName = "unknown";
    if (xrsl->IsRelation("jobname")) {
        JobName = xrsl->GetRelation("jobname").GetSingleValue();
    }

    URL *jobID = NULL;
    try {
        jobID = new URL(submit.Submit(20));
        AddJobID(jobID->str(), JobName);
    } catch (const JobSubmissionError& e) {
        notify(WARNING) << "Error: " << e.what() << endl;
        // Still NULL if Submit() threw; set if the failure came later
        delete jobID;
        return new URL("");
    }

    notify(INFO) << "Submitted job " << *jobID << endl;

    return jobID;
}

void GridJM::UpdateJobStatus() {

    /**
     * Iterate over the jobs, get the status for each one,
     * and act if the state is "FINISHED" or "FAILED"
     */

    /* Get current jobs as a list of strings */
    list<string> jobstrs;
    for (map<string,JobStatus*>::iterator jsi = current_jobs.begin();
            jsi != current_jobs.end(); jsi++)
        jobstrs.push_back((*jsi).first);

    bool incremented = false;

    /* Perform query for the list */
    list<Job> joblist = GetJobInfo(jobstrs);

    for (list<Job>::iterator jli = joblist.begin();
        jli != joblist.end(); jli++) {

        string status = "";
        string error = "";

        if (jli->status == "") {
            notify(INFO) << "Job " << jli->id << " is sent but can't be seen (yet)" << endl;
            status = "NOTFOUND";
            error = "";
        }
        else {
            status = jli->status;
            error = jli->errors;
        }
        string prev_status  = current_jobs[jli->id]->getPrevStatus();
        time_t prev_time    = current_jobs[jli->id]->getPrevTime();
        time_t current_time = time(NULL);
        current_jobs[jli->id]->setNewStatus(status);


        notify(INFO) << "Status change " << jli->id << endl;
        notify(INFO) << "  " << prev_status << " => " << status;
        notify(INFO) << " (" << current_time - prev_time << ")" << endl;

        if (strncmp(status.c_str(),"FINISHED", 8) == 0) {

            // Check if error occured during job execution
            if (error.empty()) {
                // Normal job exit
                notify(INFO) << "Job " << jli->id <<
                    " finished correctly and was removed from current job list"
                    << endl;
                // Fetch the job (safe, as not found
                // jobs are not in state FINISHED)
                FetchJob(jli->id);
                // Free memory of jli
//                JobStatus *js = current_jobs[jli->id];
                current_jobs.erase(jli->id);
//                delete js;
                pthread_mutex_lock(&current_jobs_lock);
                -- num_current_jobs;
                pthread_mutex_unlock(&current_jobs_lock);
                // XXX Write instrumentation here:
                // finished correctly
            }
            // Error occured during job execution
            else {
                notify(INFO) << cout <<
                    "Job with id " << jli->id << " failed" << endl;
                // handle job failure here
                bool job_resubmitted = HandleJobFailure(jli->id);
            }
        }

        else if ((strncmp(status.c_str(), "INLRMS", 6) == 0) &&
             (strncmp(status.c_str(), "INLRMS:Q", 8) != 0)) {
            // Job is running
        }

        // Check that the job has been doing some progress in
        // grid.
        // The longest status string is MAXSTATUSLENGTH chars
        else if (strncmp(status.c_str(), prev_status.c_str(), MAXSTATUSLENGTH)
                == 0) {
            notify(INFO) << "Job with id " <<
                jli->id << " has not changed" << endl;

            if (current_time - prev_time > s_timeout) {
                // Job has been on the same state for
                // too long.
                HandleJobFailure(jli->id);
                // XXX Write instrumentation here
                // state stall
            }
        }

        else if (status.compare("FAILED") == 0) {
            cout << "Job with id " << jli->id << "failed" << endl;
            // handle job failure here
            HandleJobFailure(jli->id);
        }
    }
    return;
}

/**
 * Starts downloading the output of a job in a child process.
 *
 * Forks and runs `arcget -dir /home/aehyvari/ngdownload <jobname>` in the
 * child.  The child's pid is appended to forked_downloads so the main loop
 * can reap it later.
 *
 * @param jobname  id of the job to fetch, passed to arcget as is.
 */
void GridJM::FetchJob(string jobname) {
    pid_t pid = fork();

    if (pid < 0) {
        // Do not record -1: waitpid(-1, ...) would reap arbitrary children
        notify(ERROR) << "Could not fork arcget for job " << jobname << endl;
        return;
    }

    if (pid == 0) {
        execlp("arcget", "arcget", "-dir", "/home/aehyvari/ngdownload",
                jobname.c_str(), (char *)NULL);
        // Only reached when exec failed.  Leave immediately, otherwise the
        // child would carry on as a second copy of the job manager.
        _exit(127);
    }

    // In parent
    forked_downloads.push_back(pid);
}

/**
 * Tries to recover a failed or stalled job.
 *
 * If the job has already been resubmitted RESUBMIT_ATTEMPTS times it is
 * cancelled, cleaned, removed from current_jobs and num_current_jobs is
 * decremented.  Otherwise the cluster it last ran on is excluded in its
 * description, the job is submitted again and the record is moved to the
 * new job id so later status queries follow the new submission.
 *
 * @param jobname  key of the job in current_jobs (its current job id).
 * @return true if the job was submitted again, false if it was dropped,
 *         is unknown, or the new submission did not yield a job id.
 */
bool GridJM::HandleJobFailure(string jobname) {

    // find() rather than operator[]: the latter inserts a NULL record for
    // an unknown id, which would be dereferenced right below.
    map<string,JobStatus*>::iterator entry = current_jobs.find(jobname);
    if (entry == current_jobs.end() || entry->second == NULL) {
        notify(ERROR) << "Cannot recover unknown job " << jobname << endl;
        return false;
    }

    string jobid = jobname;
    JobStatus *js = entry->second;

    cout << "Job " << jobid << " failed - attempting to recover" << endl;

    if (js->getAttempts() >= RESUBMIT_ATTEMPTS) {
        // The job has used up its resubmissions and still failed:
        // throw it out
        notify(DEBUG) << "Job " << jobid << " has been re-submitted "
                 << RESUBMIT_ATTEMPTS << " times - removing it from job list"
                 << endl;
        notify(WARNING) << "Job " << jobid <<
            " was removed from current job list, due to maximum number of submissions attempts being reached" << endl;

        // Cancel job
        try {
            CancelJob(jobid);
            CleanJob(jobid);
            RemoveJobID(jobid);
        }
        catch (const ARCLibError& e) {
            notify(ERROR) << "Cancel or clean of job " << jobid << " failed:" << endl;
            notify(ERROR) << e.what() << endl;
        }
        // NOTE(review): the JobStatus object is intentionally not deleted,
        // as in the original code (delete was commented out there).
        current_jobs.erase(entry);
        // XXX Write instrumentation here:
        // state stall + deleted
        pthread_mutex_lock(&current_jobs_lock);
        -- num_current_jobs;
        pthread_mutex_unlock(&current_jobs_lock);
        return false;
    }

    // If we get here we attempt to resubmit the job, keeping it away from
    // the cluster it was on.
    Xrsl* xrsl = js->getXrsl();
    string old_cluster = js->getJobid()->Host();

    xrsl->AddSimpleRelation("cluster", operator_neq, old_cluster);

    URL *new_jobid = SubmitJob(xrsl);
    if (!new_jobid) {
        notify(ERROR) << "Resubmission of job " << jobid << " failed" << endl;
        return false;
    }

    // Counts the attempt and hands new_jobid over to the record
    js->newSubmission(new_jobid);
    notify(DEBUG) << jobid << " was resubmitted as " << *new_jobid << endl;

    js->setNewStatus("RESUB");

    // SubmitJob() returns a URL built from "" when submission failed.
    // NOTE(review): assumes such a URL has an empty Host() - confirm against
    // the URL implementation.
    if (new_jobid->Host().empty())
        return false;

    // Status queries are made with the keys of current_jobs, so the record
    // has to live under the id of the new submission from now on.
    const string new_key = new_jobid->str();
    if (new_key != jobname) {
        current_jobs.erase(entry);
        current_jobs[new_key] = js;
    }

    return true; // job was submitted again
}

// ========================================================
// JobStatus class from here
// ========================================================


/**
 * Creates the bookkeeping record for a freshly submitted job.
 *
 * @param xrsl   job description; stored, and deleted by the destructor.
 * @param jobid  id of the submission; stored, and deleted by the destructor.
 *               Must not be NULL, its host is read here.
 *
 * The record starts with zero attempts, state "INITIAL" and the current
 * time as the time of the last state change.  The host of the job id is
 * appended to the cluster list.
 */
JobStatus::JobStatus(Xrsl *xrsl, URL *jobid) {

    // Keep the objects handed in by the caller
    this->xrsl = xrsl;
    this->current_jobid = jobid;

    // Fresh record: nothing resubmitted, no state seen yet
    attempts = 0;
    prev_status = "INITIAL";
    prev_time = time(NULL);

    // Remember which cluster this submission went to
    clusters.push_back(jobid->Host());
}


/** Frees the job id and the job description held by this record. */
JobStatus::~JobStatus() {
    // The two deletes are independent of each other
    delete this->xrsl;
    delete this->current_jobid;
}


/** Returns the stored job description pointer (the object itself, not a copy). */
Xrsl* JobStatus::getXrsl() {
    return this->xrsl;
}


/** Returns the stored pointer to the id of the current submission. */
URL *JobStatus::getJobid() {
    return this->current_jobid;
}


int JobStatus::getAttempts() {

    return attempts;
}

/** Returns a copy of the most recently recorded state string. */
string JobStatus::getPrevStatus() {
    return this->prev_status;
}

time_t JobStatus::getPrevTime() {

    return prev_time;
}

/**
 * Records a newly observed job state.
 *
 * @param s  state string reported for the job.
 *
 * The stored state and its timestamp are only replaced when s differs from
 * the stored state within the first MAXSTATUSLENGTH characters, so the
 * timestamp always marks the last real change.
 */
void JobStatus::setNewStatus(string s) {

    const bool unchanged =
        strncmp(prev_status.c_str(), s.c_str(), MAXSTATUSLENGTH) == 0;
    if (unchanged)
        return;

    prev_time = time(NULL);
    prev_status = s;
}


/**
 * Switches the record over to a new submission of the same job.
 *
 * @param new_jobid  id of the new submission; stored, and deleted by the
 *                   destructor or by the next call to this function.
 *
 * The host of the previous job id is appended to the cluster list, the
 * previous id is deleted and the attempt counter goes up by one.
 */
void JobStatus::newSubmission(URL *new_jobid) {
    URL *previous = current_jobid;

    clusters.push_back(previous->Host());
    delete previous;

    current_jobid = new_jobid;
    ++attempts;
}
