/*
*         OpenPBS (Portable Batch System) v2.3 Software License
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
* other than those described below, or to purchase support for this software,
* please contact Veridian Systems, PBS Products Department ("Licensor") at:
*
*    www.OpenPBS.org  +1 650 967-4675                  sales@OpenPBS.org
*                        877 902-4PBS (US toll-free)
* ---------------------------------------------------------------------------
*
* This license covers use of the OpenPBS v2.3 software (the "Software") at
* your site or location, and, for certain users, redistribution of the
* Software to other sites and locations.  Use and redistribution of
* OpenPBS v2.3 in source and binary forms, with or without modification,
* are permitted provided that all of the following conditions are met.
* After December 31, 2001, only conditions 3-6 must be met:
*
* 1. Commercial and/or non-commercial use of the Software is permitted
*    provided a current software registration is on file at www.OpenPBS.org.
*    If use of this software contributes to a publication, product, or
*    service, proper attribution must be given; see www.OpenPBS.org/credit.html
*
* 2. Redistribution in any form is only permitted for non-commercial,
*    non-profit purposes.  There can be no charge for the Software or any
*    software incorporating the Software.  Further, there can be no
*    expectation of revenue generated as a consequence of redistributing
*    the Software.
*
* 3. Any Redistribution of source code must retain the above copyright notice
*    and the acknowledgment contained in paragraph 6, this list of conditions
*    and the disclaimer contained in paragraph 7.
*
* 4. Any Redistribution in binary form must reproduce the above copyright
*    notice and the acknowledgment contained in paragraph 6, this list of
*    conditions and the disclaimer contained in paragraph 7 in the
*    documentation and/or other materials provided with the distribution.
*
* 5. Redistributions in any form must be accompanied by information on how to
*    obtain complete source code for the OpenPBS software and any
*    modifications and/or additions to the OpenPBS software.  The source code
*    must either be included in the distribution or be available for no more
*    than the cost of distribution plus a nominal fee, and all modifications
*    and additions to the Software must be freely redistributable by any party
*    (including Licensor) without restriction.
*
* 6. All advertising materials mentioning features or use of the Software must
*    display the following acknowledgment:
*
*     "This product includes software developed by NASA Ames Research Center,
*     Lawrence Livermore National Laboratory, and Veridian Information
*     Solutions, Inc.
*     Visit www.OpenPBS.org for OpenPBS software support,
*     products, and information."
*
* 7. DISCLAIMER OF WARRANTY
*
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT
* ARE EXPRESSLY DISCLAIMED.
*
* IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE
* U.S. GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
/*
 * job_recov.c - This file contains the functions to record a job
 * data structure to disk and to recover it from disk.
 *
 * The data is recorded in a file whose name is the job_id.
 *
 * The following public functions are provided:
 *  job_save()   - save the disk image
 *  job_recov()  - recover (read) job from disk
 */

#include <pbs_config.h>   /* the master config generated by configure */

#include <pthread.h>

#include <sys/types.h>
#include <sys/param.h>
#include "pbs_ifl.h"
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <pthread.h>
#include "server_limits.h"
#include "list_link.h"
#include "attribute.h"
#include "pbs_job.h"
#include "log.h"
#include "../lib/Liblog/pbs_log.h"
#include "../lib/Liblog/log_event.h"
#include "lib_ifl.h"
#include "svrfunc.h"
#include "server.h"
#include "alps_constants.h"
#if __STDC__ != 1
#include <memory.h>
#endif
#ifndef PBS_MOM
#include "array.h"
#include "../lib/Libutils/u_lock_ctl.h" /* lock_ss, unlock_ss */
#include "job_func.h"
#else
#include "../resmom/mom_job_func.h"
#endif
#include "array.h"
#include "ji_mutex.h"
#include "job_recov.h"
#include "policy_values.h"

/* fall-back boolean constants for builds whose headers do not define them */
#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif

/*
 * NOTE(review): JOBBUFSIZE and MAX_SAVE_TRIES are not referenced in the
 * portion of this file that was reviewed - confirm they are still needed.
 */
#define JOBBUFSIZE 2048
#define MAX_SAVE_TRIES 3
/* size of the scratch buffers used to format values before adding xml nodes */
#define BUFSIZE 1024

#ifdef PBS_MOM
int recov_tmsock(int, job *);
extern unsigned int  pbs_mom_port;
extern unsigned int  pbs_rm_port;   /* appended to the file prefix when multi_mom is set */
extern int           multi_mom;     /* non-zero selects the per-port job file naming */
#else
extern char         *pbs_o_host;
extern char          server_name[];
#endif

extern int job_qs_upgrade(job *, int, char *, int);

/* global data items */

extern char  *path_jobs;
extern const char *PJobSubState[];
extern int LOGLEVEL;

/* NOTE(review): not referenced in the portion of this file that was reviewed */
const int DEFAULT_ARRAY_RECOV_SIZE = 101;

/* data global only to this file */


/*
 * assign_tag_len_5() - store the content of an xml tag whose name is five
 * characters long into the matching ji_qs field of the job.
 *
 * Handles JOBID_TAG, STATE_TAG and QUEUE_TAG; only the first five characters
 * of the tag are compared.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_5(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  job        *target = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  if (strncmp(name, JOBID_TAG, 5) == 0)
    {
    snprintf(target->ji_qs.ji_jobid, PBS_MAXSVRJOBID + 1, "%s", text);
    return(PBSE_NONE);
    }

  if (strncmp(name, STATE_TAG, 5) == 0)
    {
    target->ji_qs.ji_state = atoi(text);
    return(PBSE_NONE);
    }

  if (strncmp(name, QUEUE_TAG, 5) == 0)
    {
    snprintf(target->ji_qs.ji_queue, PBS_MAXQUEUENAME + 1, "%s", text);
    return(PBSE_NONE);
    }

  /* not a tag this helper knows about */
  return(-1);
  }

/*
 * assign_tag_len_6() - store the content of an xml tag whose name is six
 * characters long.
 *
 * In the MOM build this handles TASKID_TAG and NODEID_TAG and returns -1 for
 * anything else. In every other build no field is assigned and PBSE_NONE is
 * returned unconditionally.
 *
 * @return PBSE_NONE on success, -1 for an unrecognized tag (MOM build only)
 */

int assign_tag_len_6(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
#ifdef PBS_MOM
  job        *target = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  if (strncmp(name, TASKID_TAG, 6) == 0)
    {
    target->ji_taskid = atoi(text);
    return(PBSE_NONE);
    }

  if (strncmp(name, NODEID_TAG, 6) == 0)
    {
    target->ji_nodeid = atoi(text);
    return(PBSE_NONE);
    }

  return(-1);
#else
  /* six-character tags are only interpreted by the MOM build */
  return(PBSE_NONE);
#endif /* PBS_MOM */
  }

/*
 * assign_tag_len_7() - store the content of an xml tag whose name is seven
 * characters long. Only VERSION_TAG is handled.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_7(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  job *target = *pj;

  if (strncmp((const char *)tag, VERSION_TAG, 7) != 0)
    return(-1);

  target->ji_qs.qs_version = atoi((const char *)content);

  return(PBSE_NONE);
  }

/*
 * assign_tag_len_8() - store the content of an xml tag whose name is eight
 * characters long into the matching ji_qs field of the job.
 *
 * Handles SUBSTATE_TAG, MOM_PORT_TAG and SVR_ADDR_TAG; only the first eight
 * characters of the tag are compared.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_8(

  job     **pj,      /* M */  /* job information to fill into */
  xmlChar  *tag,     /* I */  /* xml tag */
  xmlChar  *content) /* I */  /* content of the xml node */

  {
  int         rc = PBSE_NONE;
  job        *pjob = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  if (strncmp(name, SUBSTATE_TAG, 8) == 0)
    pjob->ji_qs.ji_substate = atoi(text);
  else if (strncmp(name, MOM_PORT_TAG, 8) == 0)
    pjob->ji_qs.ji_un.ji_exect.ji_momport = atoi(text);
  else if (strncmp(name, SVR_ADDR_TAG, 8) == 0)
    {
    /* the address is saved with "%lu"; parse it as unsigned so values above
     * LONG_MAX are not clamped the way atol() would clamp them */
    pjob->ji_qs.ji_un.ji_momt.ji_svraddr = strtoul(text, NULL, 10);
    }
  else
    rc = -1;

  return rc;
  }

/*
 * assign_tag_len_9() - store the content of an xml tag whose name is nine
 * characters long. Only FROM_HOST_TAG is handled.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_9(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  int  rc = PBSE_NONE;
  job *pjob = *pj;

  if (strncmp((const char *)tag, FROM_HOST_TAG, 9) == 0)
    {
    /* the address is saved with "%lu"; parse it as unsigned so values above
     * LONG_MAX are not clamped the way atol() would clamp them */
    pjob->ji_qs.ji_un.ji_newt.ji_fromaddr = strtoul((const char *)content, NULL, 10);
    }
  else
    rc = -1;

  return rc;
  }

/*
 * fill_svrattr_info() - create an svrattrl entry for an attribute (and
 * optional resource) name and copy the value string into it.
 *
 * The entry comes from attrlist_create(); the callers in this file release it
 * with free(). When avalue is NULL the entry is created with a value size of
 * zero and nothing is copied.
 *
 * @return the new entry, or NULL (with a message in log_buf) when
 *         attrlist_create() fails
 */

svrattrl *fill_svrattr_info(

  const char *aname,    /* I */ /* attribute name */
  const char *avalue,   /* I */ /* attribute value */
  const char *rname,    /* I */ /* resource name */
  char       *log_buf,  /* O */ /* error buffer */
  size_t      buf_len)  /* I */ /* len of error buffer */

  {
  /* room for the value plus its terminating NUL, or nothing at all */
  size_t    value_size = (avalue != NULL) ? strlen(avalue) + 1 : 0;
  svrattrl *entry = attrlist_create(aname, rname, value_size);

  if (entry == NULL)
    {
    snprintf(log_buf, buf_len, "Error: could not allocate memory for svrattrl in %s", __func__);
    return(NULL);
    }

  if (avalue != NULL)
    memcpy(entry->al_value, avalue, value_size);

  CLEAR_LINK(entry->al_link);

  return(entry);
  }

/*
 * assign_tag_len_10() - store the content of an xml tag whose name is ten
 * characters long into the matching ji_qs field of the job.
 *
 * Handles STIME_TAG, FPREFIX_TAG, MOM_RPORT_TAG and QUE_TIME_TAG; only the
 * first ten characters of the tag are compared.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_10(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  job        *target = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  if (strncmp(name, STIME_TAG, 10) == 0)
    {
    target->ji_qs.ji_stime = atol(text);
    return(PBSE_NONE);
    }

  if (strncmp(name, FPREFIX_TAG, 10) == 0)
    {
    snprintf(target->ji_qs.ji_fileprefix, PBS_JOBBASE + 1, "%s", text);
    return(PBSE_NONE);
    }

  if (strncmp(name, MOM_RPORT_TAG, 10) == 0)
    {
    target->ji_qs.ji_un.ji_exect.ji_mom_rmport = atoi(text);
    return(PBSE_NONE);
    }

  if (strncmp(name, QUE_TIME_TAG, 10) == 0)
    {
    target->ji_qs.ji_un.ji_routet.ji_quetime = atol(text);
    return(PBSE_NONE);
    }

  return(-1);
  } /* END assign_tag_len_10 */

/*
 * assign_tag_len_11() - store the content of an xml tag whose name is eleven
 * characters long into the matching ji_qs field of the job.
 *
 * Handles REC_TYPE_TAG, FROM_SOCK_TAG, SCRT_SIZE_TAG, MOM_ADDR_TAG and
 * EXIT_STAT_TAG; only the first eleven characters of the tag are compared.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_11(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  int         rc = PBSE_NONE;
  job        *pjob = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  if (strncmp(name, REC_TYPE_TAG, 11) == 0)
    pjob->ji_qs.ji_un_type = atoi(text);
  else if (strncmp(name, FROM_SOCK_TAG, 11) == 0)
    pjob->ji_qs.ji_un.ji_newt.ji_fromsock = atoi(text);
  else if (strncmp(name, SCRT_SIZE_TAG, 11) == 0)
    pjob->ji_qs.ji_un.ji_newt.ji_scriptsz = atoi(text);
  else if (strncmp(name, MOM_ADDR_TAG, 11) == 0)
    {
    /* the address is saved with "%lu"; parse it as unsigned so values above
     * LONG_MAX are not clamped the way atol() would clamp them */
    pjob->ji_qs.ji_un.ji_exect.ji_momaddr = strtoul(text, NULL, 10);
    }
  else if (strncmp(name, EXIT_STAT_TAG, 11) == 0)
    pjob->ji_qs.ji_un.ji_momt.ji_exitstat = atoi(text);
  else
    rc = -1;

  return rc;
  } /* END assign_tag_len_11 */


/*
 * assign_tag_len_12() - store the content of an xml tag whose name is twelve
 * characters long. Handles SRV_FLAGS_TAG and RQUE_TIME_TAG.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_12(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  job        *target = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  if (strncmp(name, SRV_FLAGS_TAG, 12) == 0)
    {
    target->ji_qs.ji_svrflags = atoi(text);
    return(PBSE_NONE);
    }

  if (strncmp(name, RQUE_TIME_TAG, 12) == 0)
    {
    target->ji_qs.ji_un.ji_routet.ji_rteretry = atol(text);
    return(PBSE_NONE);
    }

  return(-1);
  } /* END assign_tag_len_12 */

/*
 * assign_tag_len_13() - store the content of an xml tag whose name is thirteen
 * characters long.
 *
 * Handles EXEC_UID_TAG and EXEC_GID_TAG, plus STDOUT_TAG and STDERR_TAG in the
 * MOM build; only the first thirteen characters of the tag are compared.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_13(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  int         rc = PBSE_NONE;
  job        *pjob = *pj;
  const char *name = (const char *)tag;
  const char *text = (const char *)content;

  /* the ids are saved with "%u"; parse them as unsigned so ids above INT_MAX
   * do not overflow the way they would with atoi() */
  if (strncmp(name, EXEC_UID_TAG, 13) == 0)
    pjob->ji_qs.ji_un.ji_momt.ji_exuid = (unsigned int) strtoul(text, NULL, 10);
  else if (strncmp(name, EXEC_GID_TAG, 13) == 0)
    pjob->ji_qs.ji_un.ji_momt.ji_exgid = (unsigned int) strtoul(text, NULL, 10);
#ifdef PBS_MOM
  else if (strncmp(name, STDOUT_TAG, 13) == 0)
    pjob->ji_stdout = atoi(text);
  else if (strncmp(name, STDERR_TAG, 13) == 0)
    pjob->ji_stderr = atoi(text);
#endif /* PBS_MOM */
  else
    rc = -1;

  return rc;
  } /* END assign_tag_len_13 */

/*
 * assign_tag_len_17() - store the content of an xml tag whose name is
 * seventeen characters long. Only DST_QUEUE is handled.
 *
 * @return PBSE_NONE when the tag was recognized, -1 otherwise
 */

int assign_tag_len_17(

  job     **pj,      /* M */ /* job information to fill into */
  xmlChar  *tag,     /* I */ /* xml tag */
  xmlChar  *content) /* I */ /* content of the xml node */

  {
  job *target = *pj;

  if (strncmp((const char *)tag, DST_QUEUE, 17) != 0)
    return(-1);

  snprintf(target->ji_qs.ji_destin, PBS_MAXROUTEDEST + 1, "%s", (const char *)content);

  return(PBSE_NONE);
  } /* END assign_tag_len_17 */

/*
 * assign_job_field() - copy the content of one xml element into the job.
 *
 * The element name's length selects one of the assign_tag_len_N() helpers,
 * which then matches the name against the tags of that length. A name whose
 * length has no helper, or that the helper rejects, is reported as an
 * invalid tag.
 *
 * @return the helper's result (PBSE_NONE on success); -1 when the tag is
 *         unknown or the node has no content, with a message in log_buf
 */

int assign_job_field(

  job     **pjob,    /* M */ /* job information to fill into */
  xmlNode *xml_node, /* I */ /* current node to parse */
  char     *log_buf, /* O */ /* error message buffer */
  size_t    buf_len) /* I */ /* size of error message buffer */

  {
  xmlChar  *tag = (xmlChar *)xml_node->name;
  xmlChar  *content = xmlNodeGetContent(xml_node);
  int     (*assign)(job **, xmlChar *, xmlChar *) = NULL;
  int       rc = -1;

  if (content == NULL)
    {
    snprintf(log_buf, buf_len, "Error: xml tag %s did not have a value", tag);
    return(rc);
    }

  /* pick the helper that knows the tags of this length */
  switch (strlen((char *)tag))
    {
    case 5:  assign = assign_tag_len_5;  break;
    case 6:  assign = assign_tag_len_6;  break;
    case 7:  assign = assign_tag_len_7;  break;
    case 8:  assign = assign_tag_len_8;  break;
    case 9:  assign = assign_tag_len_9;  break;
    case 10: assign = assign_tag_len_10; break;
    case 11: assign = assign_tag_len_11; break;
    case 12: assign = assign_tag_len_12; break;
    case 13: assign = assign_tag_len_13; break;
    case 17: assign = assign_tag_len_17; break;
    default: break;
    }

  if (assign != NULL)
    rc = assign(pjob, tag, content);

  if (rc == -1)
    snprintf(log_buf, buf_len, "error: invalid tag found %s", tag);

  xmlFree(content);

  return(rc);
  } /* END assign_job_field */


/*
 * decode_attribute() - decode one recovered svrattrl entry into the job's
 * attribute array.
 *
 * The attribute is looked up by name in job_attr_def; names that are not
 * found are stored under JOB_ATR_UNKN. After decoding, the attribute's
 * at_action routine (if any) is run with ATR_ACTION_RECOV and the attribute
 * flags are replaced by the saved flags with ATR_VFLAG_MODIFY cleared.
 *
 * @param pal          - the encoded attribute read from the job file
 * @param pjob         - the job whose attribute is filled in
 * @param freeExisting - when true, the current value is released with the
 *                       attribute's at_free routine before decoding
 */

void decode_attribute(

  svrattrl *pal,
  job **pjob,
  bool freeExisting)

  {
  job           *target = *pjob;
  int            slot = find_attr(job_attr_def, pal->al_name, JOB_ATR_LAST);
  pbs_attribute *dest;

  /* unrecognized names fall into the catch-all attribute */
  if (slot < 0)
    slot = JOB_ATR_UNKN;

  dest = &target->ji_wattr[slot];

  if (freeExisting)
    job_attr_def[slot].at_free(dest);

  if (slot != JOB_ATR_hold)
    {
    job_attr_def[slot].at_decode(dest,
                                 pal->al_name,
                                 pal->al_resc,
                                 pal->al_value,
                                 ATR_DFLAG_ACCESS);
    }
  else
    {
    /* the hold attribute is saved as a number, which its decode routine
     * would not interpret correctly, so read it directly */
    dest->at_val.at_long = strtol(pal->al_value, NULL, 10);
    }

  if (job_attr_def[slot].at_action != NULL)
    job_attr_def[slot].at_action(dest, target, ATR_ACTION_RECOV);

  dest->at_flags = pal->al_flags & ~ATR_VFLAG_MODIFY;
  } // END decode_attribute()


/*
 * fill_resource_list() - decode every child element of a resource head node
 * (e.g. Resource_List, resources_used) into the job attribute named aname.
 *
 * Each child element name is used as the resource name and its content as
 * the value. The existing attribute value is freed only before the first
 * child is decoded, so later children add to it instead of replacing it.
 *
 * @return PBSE_NONE on success; -1 when an entry could not be created or
 *         when the head node has no element children (message in log_buf)
 */

int fill_resource_list(

  job        **pj,
  xmlNodePtr   resource_list_node,
  char        *log_buf,
  size_t       buflen,
  const char  *aname)

  {
  int        rc = PBSE_NONE;
  bool       saw_element = false;
  bool       first_entry = true;
  xmlNodePtr next_child = resource_list_node->children;

  while (next_child != NULL)
    {
    xmlNodePtr child = next_child;

    next_child = child->next;

    /* text children carry no resource, only elements are processed */
    if (strcmp((const char *)child->name, text_name) == 0)
      continue;

    saw_element = true;

    xmlChar  *text = xmlNodeGetContent(child);
    svrattrl *entry = fill_svrattr_info(aname,
                                        (const char *)text,
                                        (const char *)child->name,
                                        log_buf,
                                        buflen);

    if (entry == NULL)
      {
      rc = -1;
      }
    else
      {
      char *flag_text = (char *)xmlGetProp(child, (xmlChar *)AL_FLAGS_ATTR);

      if (flag_text != NULL)
        {
        entry->al_flags = (unsigned int)atoi(flag_text);
        xmlFree(flag_text);
        }

      decode_attribute(entry, pj, first_entry);
      first_entry = false;
      free(entry);
      }

    if (text != NULL)
      xmlFree(text);
    }

  if (!saw_element)
    {
    snprintf(log_buf, buflen, "%s", "no Resource_List nodes were found in the xml");
    rc = -1;
    }

  return(rc);
  }


/*
 * parse_attributes() - decode the children of the attributes node into the
 * job's attribute array.
 *
 * Plain attributes are decoded as they are encountered. The ATTR_l,
 * ATTR_used and ATTR_req_information nodes are only remembered during the
 * scan and are handed to fill_resource_list(), in that order, once the scan
 * has finished without error.
 *
 * @return PBSE_NONE on success, -1 on failure (message in log_buf)
 */

int parse_attributes(

  job     **pj,       /* M */ /* job information to fill into */
  xmlNode *attr_node, /* I */ /* attribute node to parse */
  char    *log_buf,   /* O */ /* error message buffer */
  size_t   buf_len)   /* I */ /* size of error message buffer */

  {
  int      rc = PBSE_NONE;
  xmlNode *child;
  xmlNode *resource_list = NULL;
  xmlNode *resources_used = NULL;
  xmlNode *complete_req = NULL;
  bool     saw_element = false;

  for (child = attr_node->children; (child != NULL) && (rc == PBSE_NONE); child = child->next)
    {
    const char *name = (const char *)child->name;

    /* text children are not attributes */
    if (strcmp(name, text_name) == 0)
      continue;

    saw_element = true;

    /* the resource-style nodes are handled after the scan */
    if (strcmp(name, ATTR_l) == 0)
      {
      resource_list = child;
      continue;
      }

    if (strcmp(name, ATTR_used) == 0)
      {
      resources_used = child;
      continue;
      }

    if (strcmp(name, ATTR_req_information) == 0)
      {
      complete_req = child;
      continue;
      }

    xmlChar  *text = xmlNodeGetContent(child);
    svrattrl *entry = fill_svrattr_info(name, (const char *)text, NULL, log_buf, buf_len);

    if (entry == NULL)
      {
      rc = -1;
      }
    else
      {
      xmlChar *flag_text = xmlGetProp(child, (xmlChar *)AL_FLAGS_ATTR);

      if (flag_text != NULL)
        {
        entry->al_flags = (unsigned int)atoi((char *)flag_text);
        xmlFree(flag_text);
        }

      decode_attribute(entry, pj, true);
      free(entry);
      }

    if (text != NULL)
      xmlFree(text);
    }

  /* with no elements at all there is nothing to recover */
  if (!saw_element)
    {
    snprintf(log_buf, buf_len, "%s", "Error: there were no job attributes found");
    return(-1);
    }

  if ((rc == PBSE_NONE) && (resource_list != NULL))
    rc = fill_resource_list(pj, resource_list, log_buf, buf_len, ATTR_l);

  if ((rc == PBSE_NONE) && (resources_used != NULL))
    rc = fill_resource_list(pj, resources_used, log_buf, buf_len, ATTR_used);

  if ((rc == PBSE_NONE) && (complete_req != NULL))
    rc = fill_resource_list(pj, complete_req, log_buf, buf_len, ATTR_req_information);

  return(rc);
  } /* END parse_attributes */


/*
 * check_fileprefix() - verify that a job file's name starts with the file
 * prefix recorded inside the job, which detects ghost files.
 *
 * Only the last path component of filename is compared. In the MOM build
 * with multi_mom set, the expected prefix is the job's file prefix followed
 * by pbs_rm_port.
 *
 * @return PBSE_NONE when the name matches, -1 otherwise (message in log_buf)
 */

int check_fileprefix(

  const char *filename, /* I */  /* name of the Job file */
  job   **pjob,         /* M */  /* job structure to compare */
  char   *log_buf,      /* O */  /* error buffer */
  size_t  buf_len)      /* I */  /* error buffer length */

  {
  const char *pn;
  const char *expected;
  job        *pj = *pjob;
  int         rc = PBSE_NONE;

#ifdef PBS_MOM
  char        fileid[MAXPATHLEN];
#endif

  /* Does file name match the internal name? */
  /* This detects ghost files */
  pn = strrchr(filename, '/');
  if (pn != NULL)
    pn++;
  else
    pn = filename;

#ifdef PBS_MOM
  /* bounded formatting; pbs_rm_port is unsigned so it is printed with %u */
  if (multi_mom != 0)
    snprintf(fileid, sizeof(fileid), "%s%u", pj->ji_qs.ji_fileprefix, pbs_rm_port);
  else
    snprintf(fileid, sizeof(fileid), "%s", pj->ji_qs.ji_fileprefix);

  expected = fileid;
#else
  expected = pj->ji_qs.ji_fileprefix;
#endif

  if (strncmp(pn, expected, strlen(expected)) != 0)
    {
    /* mismatch, discard job */

    snprintf(log_buf, buf_len, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      filename);

    rc = -1;
    }

  return rc;
  }


int parse_job_dom(

  const char  *filename,     /* I */ /* filename */
  job        **pjob,         /* M */ /* pointer to a pointer of a job structure */
  xmlNodePtr   root_element, /* I */ /*Root element of the dom */
  char        *log_buf,      /* O */ /* buffer for error message */
  size_t       buf_len)      /* I */ /* Size of error message */

  {
  xmlNode *cur_node = NULL;
  xmlNode *attributeNode = NULL;
  bool     element_found = false;

  int rc = PBSE_NONE;

  for (cur_node = root_element->children; cur_node != NULL && rc == PBSE_NONE; cur_node = cur_node->next)
    {
    /* skip text children, only process elements */
    if (!strcmp((const char *)cur_node->name, text_name))
      continue;

    element_found = true;
      
    if (!(strcmp((const char*)cur_node->name, ATTRIB_TAG)))
      attributeNode = cur_node;
    else
      if ((rc = assign_job_field(pjob, cur_node, log_buf, buf_len)))
        break;
    }
    
  if (!rc && attributeNode)
    {
    if (!(rc = check_fileprefix(filename, pjob, log_buf, buf_len)))
      rc = parse_attributes(pjob, attributeNode, log_buf, buf_len);  
    }
  else if (!attributeNode && !rc)
    {
    snprintf(log_buf, buf_len, "Missing required %s tag", ATTRIB_TAG);
    rc = -1;
    }
    
  if (element_found == false)
    snprintf(log_buf, buf_len, "%s", "Error: no children xml tags were found");

  return(rc);
  }  /* END parse_job_dom */



/*
 * add_fix_fields() - add one child node to the root for each of the
 * non-union ji_qs fields of the job: version, state, substate, server
 * flags, start time, job id, file prefix, queue and destination.
 */

void add_fix_fields(

  xmlNodePtr *rnode, /* M root node */
  const job *pjob)   /* I pointer to job from which nodes will be created */

  {
  xmlNodePtr parent = *rnode;
  char       text[BUFSIZE];

  /* numeric fields are formatted into the scratch buffer first */
  snprintf(text, sizeof(text), "%d", pjob->ji_qs.qs_version);
  xmlNewChild(parent, NULL, (const xmlChar *)VERSION_TAG, (const xmlChar *)text);

  snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_state);
  xmlNewChild(parent, NULL, (const xmlChar *)STATE_TAG, (const xmlChar *)text);

  snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_substate);
  xmlNewChild(parent, NULL, (const xmlChar *)SUBSTATE_TAG, (const xmlChar *)text);

  snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_svrflags);
  xmlNewChild(parent, NULL, (const xmlChar *)SRV_FLAGS_TAG, (const xmlChar *)text);

  snprintf(text, sizeof(text), "%ld", pjob->ji_qs.ji_stime);
  xmlNewChild(parent, NULL, (const xmlChar *)STIME_TAG, (const xmlChar *)text);

  /* string fields are written straight from the job */
  xmlNewChild(parent, NULL, (const xmlChar *)JOBID_TAG, (const xmlChar *)pjob->ji_qs.ji_jobid);
  xmlNewChild(parent, NULL, (const xmlChar *)FPREFIX_TAG, (const xmlChar *)pjob->ji_qs.ji_fileprefix);
  xmlNewChild(parent, NULL, (const xmlChar *)QUEUE_TAG, (const xmlChar *)pjob->ji_qs.ji_queue);
  xmlNewChild(parent, NULL, (const xmlChar *)DST_QUEUE, (const xmlChar *)pjob->ji_qs.ji_destin);
  } /* END add_fix_fields */


/*
 * add_union_fields() - add the record type node and the nodes for the
 * ji_qs.ji_un member that the record type selects.
 *
 * An unrecognized record type writes the type node only.
 */

void add_union_fields(

  xmlNodePtr *rnode,  /* M document's root node */
  const job  *pjob)   /* I job pointer */

  {
  xmlNodePtr parent = *rnode;
  char       text[BUFSIZE];
  int        un_type = pjob->ji_qs.ji_un_type;

  snprintf(text, sizeof(text), "%d", un_type);
  xmlNewChild(parent, NULL, (const xmlChar *)REC_TYPE_TAG, (const xmlChar *)text);

  if (un_type == JOB_UNION_TYPE_NEW)
    {
    snprintf(text, sizeof(text), "%lu", pjob->ji_qs.ji_un.ji_newt.ji_fromaddr);
    xmlNewChild(parent, NULL, (const xmlChar *)FROM_HOST_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_un.ji_newt.ji_fromsock);
    xmlNewChild(parent, NULL, (const xmlChar *)FROM_SOCK_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_un.ji_newt.ji_scriptsz);
    xmlNewChild(parent, NULL, (const xmlChar *)SCRT_SIZE_TAG, (const xmlChar *)text);
    }
  else if (un_type == JOB_UNION_TYPE_EXEC)
    {
    snprintf(text, sizeof(text), "%lu", pjob->ji_qs.ji_un.ji_exect.ji_momaddr);
    xmlNewChild(parent, NULL, (const xmlChar *)MOM_ADDR_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_un.ji_exect.ji_momport);
    xmlNewChild(parent, NULL, (const xmlChar *)MOM_PORT_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_un.ji_exect.ji_mom_rmport);
    xmlNewChild(parent, NULL, (const xmlChar *)MOM_RPORT_TAG, (const xmlChar *)text);
    }
  else if (un_type == JOB_UNION_TYPE_ROUTE)
    {
    snprintf(text, sizeof(text), "%ld", pjob->ji_qs.ji_un.ji_routet.ji_quetime);
    xmlNewChild(parent, NULL, (const xmlChar *)QUE_TIME_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%ld", pjob->ji_qs.ji_un.ji_routet.ji_rteretry);
    xmlNewChild(parent, NULL, (const xmlChar *)RQUE_TIME_TAG, (const xmlChar *)text);
    }
  else if (un_type == JOB_UNION_TYPE_MOM)
    {
    snprintf(text, sizeof(text), "%lu", pjob->ji_qs.ji_un.ji_momt.ji_svraddr);
    xmlNewChild(parent, NULL, (const xmlChar *)SVR_ADDR_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%d", pjob->ji_qs.ji_un.ji_momt.ji_exitstat);
    xmlNewChild(parent, NULL, (const xmlChar *)EXIT_STAT_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%u", pjob->ji_qs.ji_un.ji_momt.ji_exuid);
    xmlNewChild(parent, NULL, (const xmlChar *)EXEC_UID_TAG, (const xmlChar *)text);

    snprintf(text, sizeof(text), "%u", pjob->ji_qs.ji_un.ji_momt.ji_exgid);
    xmlNewChild(parent, NULL, (const xmlChar *)EXEC_GID_TAG, (const xmlChar *)text);
    }
  } /* END add_union_fields */


/*
 * add_resource_list_attribute() - add one resource entry under a resource
 * head node, creating the head node on first use.
 *
 * If *res_node is NULL, a node named nodeTag is created, attached to the
 * attribute head node and stored back through res_node so later calls reuse
 * it. The resource is then added as a child named after the resource, with
 * the encoded value as its text.
 *
 * The value is added with xmlNewTextChild() so that XML special characters
 * in it (notably '&') are stored as literal text rather than being
 * interpreted as entity references.
 *
 * @return the new resource node, or NULL when the entry has no resource name
 *         or a node could not be created
 */

xmlNodePtr add_resource_list_attribute(

  const char *nodeTag,     /* I tag to use on the resource head-node */
  xmlNodePtr *attr_node,   /* M attribute head-node */
  xmlNodePtr *res_node,    /* M Resource_List/resources_used head-node */
  svrattrl   *pal)         /* I encoded attribute structure */

  {
  xmlNodePtr attributeHeadNode = *attr_node;
  xmlNodePtr resourceHeadNode = *res_node;
  xmlNodePtr resourceNode = NULL;

  if (resourceHeadNode == NULL)
    {
    resourceHeadNode = xmlNewNode(NULL, (const xmlChar *)nodeTag);

    if (resourceHeadNode != NULL)
      {
      xmlAddChild(attributeHeadNode, resourceHeadNode);
      *res_node = resourceHeadNode;
      }
    }

  /* with a NULL head node the call below simply returns NULL */
  if (pal->al_atopl.resource != NULL)
    {
    resourceNode = xmlNewTextChild(resourceHeadNode,
                                   NULL,
                                   (const xmlChar *)pal->al_atopl.resource,
                                   (const xmlChar *)pal->al_atopl.value);
    }

  return resourceNode;
  } /* END add_resource_list_attribute() */



#ifndef PBS_MOM
/*
 * translate_dependency_to_string
 *
 * Appends a textual form of the dependency attribute to value. Entries are
 * separated by ','. Each entry is the dependency type name followed either
 * by ":<count>" (for the synccount and on types) or by ":<job>" for every
 * job in the dependency.
 *
 * Translation stops, keeping whatever was already appended, at the first
 * dependency whose type is out of range or has no name.
 *
 * @param pattr - a pointer to the dependency attribute (NULL is a no-op)
 * @param value - the string to populate with the information
 */

void translate_dependency_to_string(

  pbs_attribute *pattr,
  std::string   &value)

  {
  extern struct dependnames  dependnames[];
  struct depend             *entry;

  if (pattr == NULL)
    return;

  entry = (struct depend *)GET_NEXT(pattr->at_val.at_list);

  while (entry != NULL)
    {
    /* a type outside the name table means the dependency is corrupt */
    if ((entry->dp_type < 0) ||
        (entry->dp_type >= JOB_DEPEND_NUMBER_TYPES))
      return;

    struct dependnames *kind = &dependnames[entry->dp_type];

    if (kind->name == NULL)
      return;

    if (!value.empty())
      value += ",";

    value += kind->name;

    if ((kind->type != JOB_DEPEND_TYPE_SYNCCT) &&
        (kind->type != JOB_DEPEND_TYPE_ON))
      {
      unsigned int job_count = entry->dp_jobs.size();

      for (unsigned int idx = 0; idx < job_count; idx++)
        {
        depend_job *child = entry->dp_jobs[idx];

        // Only the job is written; the server is left out because it can
        // mess up high availability scenarios.
        value += ":";
        value += child->dc_child;
        }
      }
    else
      {
      char count_text[128];

      snprintf(count_text, sizeof(count_text), ":%d", entry->dp_numexp);
      value += count_text;
      }

    entry = (struct depend *)GET_NEXT(entry->dp_link);
    }

  } /* END translate_dependency_to_string() */
#endif



/*
 * add_encoded_attributes() - add one xml node per set job attribute.
 *
 * ACL attributes and attributes without ATR_VFLAG_SET are skipped.
 *
 * Ordinary attributes become a child of the attribute node, named after the
 * attribute, holding its string form; attributes whose string form is empty
 * are skipped. The resource-style attributes (JOB_ATR_resource,
 * JOB_ATR_resc_used, JOB_ATR_req_information) are encoded with at_encode and
 * each resulting entry becomes a child of the matching resource head node.
 * Every node written carries the attribute flags in the AL_FLAGS_ATTR
 * property, and ATR_VFLAG_MODIFY is cleared on each attribute written.
 *
 * Values are added with xmlNewTextChild() so that XML special characters in
 * them (notably '&') are stored as literal text rather than being
 * interpreted as entity references.
 *
 * @return PBSE_NONE on success; -1 when an attribute could not be encoded or
 *         a resource node could not be created
 */

int add_encoded_attributes(

  xmlNodePtr     *attr_node, /* M attribute node */
  pbs_attribute  *pattr)     /* M ptr to pbs_attribute value array */

  {
  tlist_head  lhead;
  int         i;
  int         resc_access_perm = ATR_DFLAG_ACCESS;
  svrattrl   *pal;
  int         rc = PBSE_NONE;
  xmlNodePtr  attributeNode = *attr_node;
  char        buf[BUFSIZE];
  xmlNodePtr  pal_xmlNode;
  xmlNodePtr  resource_list_head_node = NULL;
  xmlNodePtr  resource_used_head_node = NULL;
  xmlNodePtr  complete_req_head_node = NULL;

  CLEAR_HEAD(lhead);

  for (i = 0; ((i < JOB_ATR_LAST) && (rc >= 0)); i++)
    {
    if ((job_attr_def[i].at_type == ATR_TYPE_ACL) ||
        (((pattr + i)->at_flags & ATR_VFLAG_SET) == 0))
      continue;

    if ((i != JOB_ATR_resource) &&
        (i != JOB_ATR_resc_used) &&
        (i != JOB_ATR_req_information))
      {
      std::string value;

#ifndef PBS_MOM
      if (i == JOB_ATR_depend)
        translate_dependency_to_string(pattr + i, value);
      else
#endif
        attr_to_str(value, job_attr_def + i, pattr[i], true);

      if (value.size() == 0)
        continue;

      pal_xmlNode = xmlNewTextChild(attributeNode,
                                    NULL,
                                    (const xmlChar *)job_attr_def[i].at_name,
                                    (const xmlChar *)value.c_str());

      if (pal_xmlNode)
        {
        snprintf(buf, sizeof(buf), "%u", (unsigned int)pattr[i].at_flags);
        xmlSetProp(pal_xmlNode, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);
        (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;
        }

      continue;
      }

    /* resource-style attribute: encode into lhead, then emit each entry */
    rc = job_attr_def[i].at_encode(pattr + i,
        &lhead,
        job_attr_def[i].at_name,
        NULL,
        ATR_ENCODE_SAVE,
        resc_access_perm);

    if (rc < 0)
      {
      /* release anything the encoder queued before it failed */
      while ((pal = (svrattrl *)GET_NEXT(lhead)) != NULL)
        {
        delete_link(&pal->al_link);
        free(pal);
        }

      return(-1);
      }

    (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;

    while ((pal = (svrattrl *)GET_NEXT(lhead)) != NULL)
      {
      if (i == JOB_ATR_resource)
        {
        pal_xmlNode = add_resource_list_attribute(ATTR_l,
                                                  attr_node,
                                                  &resource_list_head_node,
                                                  pal);
        }
      else if (i == JOB_ATR_req_information)
        {
        pal_xmlNode = add_resource_list_attribute(ATTR_req_information,
                                                  attr_node,
                                                  &complete_req_head_node,
                                                  pal);
        }
      else
        {
        pal_xmlNode = add_resource_list_attribute(ATTR_used,
                                                  attr_node,
                                                  &resource_used_head_node,
                                                  pal);
        }

      if (pal_xmlNode)
        {
        snprintf(buf, sizeof(buf), "%u", (unsigned int)pal->al_flags);
        xmlSetProp(pal_xmlNode, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);
        }
      else
        {
        /* remember the failure but keep draining so the list is freed */
        rc = -1;
        }

      delete_link(&pal->al_link);
      free(pal);
      }
    }

  /* rc can hold a positive value from at_encode; only negatives are errors */
  return((rc < 0) ? -1 : PBSE_NONE);
  } /* END add_encoded_attributes */


/*
 * add_attributes() - create the attributes node under the document root and
 * fill it with the job's encoded attributes.
 *
 * @return PBSE_NONE on success; -1 (after logging a job event) when the node
 *         could not be created or the attributes could not be added
 */

int add_attributes(

  xmlNodePtr *rnode, /* M document root node */
  job        *pjob)  /* pointer to job */

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  int        rc = -1;
  xmlNodePtr parent = *rnode;
  xmlNodePtr attributes = xmlNewNode(NULL, (xmlChar *)ATTRIB_TAG);

  if (attributes != NULL)
    {
    xmlAddChild(parent, attributes);
    rc = add_encoded_attributes(&attributes, pjob->ji_wattr);
    }

  if (rc == PBSE_NONE)
    return(rc);

  /* either the node or one of the attributes could not be added */
  snprintf(log_buf, sizeof(log_buf), "could not add the job attributes to the XML doc");
  log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

  return(-1);
  } /* END add_attributes */


#ifdef PBS_MOM
/*
 * add_mom_fields() - add the MOM-only job fields to the document
 *
 * Appends four text children to the root node, in this order: STDOUT_TAG,
 * STDERR_TAG, TASKID_TAG and NODEID_TAG, holding the decimal text of
 * ji_stdout, ji_stderr, ji_taskid and ji_nodeid respectively.
 *
 * @param rnode - pointer to the document root node that receives the children
 * @param pjob  - the job whose fields are written
 */

void add_mom_fields(

  xmlNodePtr *rnode,
  const job *pjob)

  {
  xmlNodePtr parent = *rnode;
  char       text[BUFSIZE];

  /* stdout */
  snprintf(text, sizeof(text), "%d", pjob->ji_stdout);
  xmlNewChild(parent, NULL, (xmlChar *)STDOUT_TAG, (xmlChar *)text);

  /* stderr */
  snprintf(text, sizeof(text), "%d", pjob->ji_stderr);
  xmlNewChild(parent, NULL, (xmlChar *)STDERR_TAG, (xmlChar *)text);

  /* task id */
  snprintf(text, sizeof(text), "%d", pjob->ji_taskid);
  xmlNewChild(parent, NULL, (xmlChar *)TASKID_TAG, (xmlChar *)text);

  /* node id */
  snprintf(text, sizeof(text), "%d", pjob->ji_nodeid);
  xmlNewChild(parent, NULL, (xmlChar *)NODEID_TAG, (xmlChar *)text);
  }
#endif /* PBS_MOM */

/*
 * saveJobToXML() - save job to disk in xml format
 *
 * Builds a document whose root element is JOB_TAG, fills it through
 * add_fix_fields(), add_union_fields(), add_attributes() and, in MOM builds,
 * add_mom_fields(), then writes it to filename with indentation enabled.
 * In server builds the write is wrapped in lock_ss()/unlock_ss().
 *
 * @param pjob     - (I) the job to serialize; its id is used when logging
 * @param filename - (I) path of the file to write
 * @return PBSE_NONE on success, -1 when the document or its root element
 *         cannot be created, when add_attributes() fails, or when nothing
 *         could be written to the file
 */

int saveJobToXML(

  job *pjob,      /* I - pointer to job */
  const char *filename) /* I - filename to save to */

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  xmlNodePtr root_node = NULL;
  int        lenwritten = 0;
  xmlDocPtr  doc = xmlNewDoc((const xmlChar*) "1.0");

  if (doc == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "could not create a new xml document");
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    return(-1);
    }

  root_node = xmlNewNode(NULL, (const xmlChar*) JOB_TAG);

  if (root_node == NULL)
    {
    /* without a root element only an empty document would be written,
     * and the non-zero byte count would be mistaken for a successful save */
    snprintf(log_buf, sizeof(log_buf), "could not create the root node of the xml document");
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    xmlFreeDoc(doc);
    return(-1);
    }

  xmlDocSetRootElement(doc, root_node);
  add_fix_fields(&root_node, (const job*)pjob);
  add_union_fields(&root_node, (const job*)pjob);

  /* add_attributes() logs its own failure */
  if (add_attributes(&root_node, pjob))
    {
    xmlFreeDoc(doc);
    return -1;
    }

#ifdef PBS_MOM
  add_mom_fields(&root_node, (const job*)pjob);
#endif /* PBS_MOM */

#ifndef PBS_MOM
  lock_ss();
#endif /* !defined PBS_MOM */

  lenwritten = xmlSaveFormatFileEnc(filename, doc, NULL, 1);

#ifndef PBS_MOM
  unlock_ss();
#endif /* !defined PBS_MOM */

  xmlFreeDoc(doc);

  if (lenwritten <= 0)
    {
    snprintf(log_buf, sizeof(log_buf), "failed writing job to the xml file %s", filename);
    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);

    return(-1);
    }

  return(PBSE_NONE);
  } /* saveJobToXML */


/*
 * job_save() - Saves (or updates) a job structure image on disk
 *
 * The job is always written in full, in xml format:
 * 1. the new image is written to a temporary file (JOB_FILE_COPY suffix)
 * 2. the temporary file is renamed over the real job file, so the real
 *    name always refers to either the complete old image or the complete
 *    new one
 *
 * The real file uses JOB_FILE_SUFFIX, except for array template jobs in
 * server builds, which use JOB_FILE_TMP_SUFFIX.  Thread cancellation is
 * disabled while the save is in progress and re-enabled before returning.
 *
 * @param pjob       - the job to save; when ji_modified is set its mtime
 *                     attribute is updated to the current time first
 * @param updatetype - 0=quick, 1=full, 2=new; accepted for interface
 *                     compatibility but not consulted by this function
 * @param mom_port   - if non-zero it is appended to the file prefix
 *                     (multi-mom mode)
 *
 *      RETURN:  0 - success, -1 - failure (the image could not be written
 *               or could not be moved into place)
 */

int job_save(

  job *pjob,  /* pointer to job structure */
  int  updatetype, /* 0=quick, 1=full, 2=new     */
  int  mom_port)   /* if 0 ignore otherwise append to end of job name. this is for multi-mom mode */

  {
  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);

  char        namebuf1[MAXPATHLEN];   /* real job file */
  char        namebuf2[MAXPATHLEN];   /* temporary copy, written first */
  char        port_text[32];
  const char *suffix = JOB_FILE_SUFFIX;
  int         rc = PBSE_NONE;

#ifdef PBS_MOM
  const char *jobs_dir = path_jobs;
#else
  // get the adjusted path_jobs path
  std::string adjusted_path_jobs = get_path_jobdata(pjob->ji_qs.ji_jobid, path_jobs);
  const char *jobs_dir = adjusted_path_jobs.c_str();

  if (pjob->ji_is_array_template == TRUE)
    suffix = JOB_FILE_TMP_SUFFIX;
#endif

  /* the port only becomes part of the name in multi-mom mode */
  if (mom_port)
    snprintf(port_text, sizeof(port_text), "%d", mom_port);
  else
    port_text[0] = '\0';

  snprintf(namebuf1, sizeof(namebuf1), "%s%s%s%s",
    jobs_dir, pjob->ji_qs.ji_fileprefix, port_text, suffix);
  snprintf(namebuf2, sizeof(namebuf2), "%s%s%s%s",
    jobs_dir, pjob->ji_qs.ji_fileprefix, port_text, JOB_FILE_COPY);

  /* if ji_modified is set, ie an pbs_attribute changed, then update mtime */

  if (pjob->ji_modified)
    {
    pjob->ji_wattr[JOB_ATR_mtime].at_val.at_long = time(NULL);
    }

  if (saveJobToXML(pjob, namebuf2) != 0)
    {
    log_event(PBSEVENT_ERROR | PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid,
      "call to saveJobToXML in job_save failed");

    rc = -1;
    }
  else if (rename(namebuf2, namebuf1) != 0)
    {
    /* rename() replaces the old image in a single step; on failure the old
     * image is still in place and the new one is left under the copy name */
    log_event(
      PBSEVENT_ERROR | PBSEVENT_SECURITY,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      (char *)"rename of the job file copy in job_save failed");

    rc = -1;
    }

  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);

  return(rc);
  }  /* END job_save() */


#ifndef PBS_MOM
/*
 * ghost_create_jobs_array()
 *
 * Automatically creates an array for pjob so that it doesn't have to be deleted. This job's array
 * was not properly recovered, and we don't want to lose the jobs.
 *
 * The new array gets DEFAULT_ARRAY_RECOV_SIZE job id slots, or enough slots to hold pjob's
 * array index when that index is larger. The array is saved with array_save() and linked
 * into the server's list with insert_array(). Its mutex is locked by this function and is
 * still locked when the array is returned.
 *
 * @param pjob - the job whose array wasn't recovered.
 * @param array_id - the id of the array to be created.
 * @return - a pointer to the new job array, or NULL if memory could not be allocated
 */

job_array *ghost_create_jobs_array(

  job        *pjob,
  const char *array_id)

  {
  char         log_buf[LOCAL_LOG_BUF_SIZE];
  char         file_prefix_work[PBS_JOBBASE + 1];
  long         slot_limit = NO_SLOT_LIMIT;
  long         array_size = DEFAULT_ARRAY_RECOV_SIZE;
  long         job_index = pjob->ji_wattr[JOB_ATR_job_array_id].at_val.at_long;
  job_array   *pa;

  if (LOGLEVEL >= 2)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Array %s was not successfully recovered, but we are creating it automatically to not lose the sub-jobs. Slot limits and or dependencies may not work correctly. This behavior can be disabled by setting ghost_array_recovery to false in qmgr.", array_id);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
    }

  // An index equal to the default size also needs a bigger table: valid slots are 0..size-1
  if (job_index >= array_size)
    array_size = job_index + 1;

  // Allocate everything before touching the mutex so that a failure only needs free()
  pa = (job_array *)calloc(1, sizeof(job_array));

  if (pa != NULL)
    {
    pa->ai_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));
    pa->job_ids = (char **)calloc(array_size, sizeof(char *));
    }

  if ((pa == NULL) ||
      (pa->ai_mutex == NULL) ||
      (pa->job_ids == NULL))
    {
    snprintf(log_buf, sizeof(log_buf),
      "could not allocate memory to create array %s", array_id);
    log_err(ENOMEM, __func__, log_buf);

    if (pa != NULL)
      {
      free(pa->job_ids);
      free(pa->ai_mutex);
      free(pa);
      }

    return(NULL);
    }

  pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;
  CLEAR_HEAD(pa->request_tokens);
  CLEAR_HEAD(pa->ai_qs.deps);
  pthread_mutex_init(pa->ai_mutex,NULL);

  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  snprintf(pa->ai_qs.parent_id, sizeof(pa->ai_qs.parent_id), "%s", array_id);

  get_svr_attr_l(SRV_ATR_MaxSlotLimit, &slot_limit);
  pa->ai_qs.slot_limit = slot_limit;

  // Remove the [] from the file prefix: keep the text before '[' and,
  // when present, the text starting at the first '.'
  snprintf(file_prefix_work, sizeof(file_prefix_work), "%s", array_id);
  char *bracket = strchr(file_prefix_work, '[');
  char *dot = strchr(file_prefix_work, '.');

  if (bracket != NULL)
    *bracket = '\0';

  if ((bracket != NULL) &&
      (dot != NULL))
    snprintf(pa->ai_qs.fileprefix, sizeof(pa->ai_qs.fileprefix), "%s%s", file_prefix_work, dot);
  else
    snprintf(pa->ai_qs.fileprefix, sizeof(pa->ai_qs.fileprefix), "%s", file_prefix_work);

  // Either string may be missing on a partially recovered job; never pass NULL to %s
  const char *owner = pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str;
  const char *submit_host = get_variable(pjob, pbs_o_host);

  snprintf(pa->ai_qs.owner, sizeof(pa->ai_qs.owner), "%s",
    (owner != NULL) ? owner : "");
  snprintf(pa->ai_qs.submit_host, sizeof(pa->ai_qs.submit_host), "%s",
    (submit_host != NULL) ? submit_host : "");

  if (job_index >= 0)
    pa->job_ids[job_index] = strdup(pjob->ji_qs.ji_jobid);

  pa->ai_qs.array_size = array_size;
  pa->ai_ghost_recovered = true;
  array_save(pa);

  /* link the struct into the servers list of job arrays */
  insert_array(pa);

  return(pa);
  } // END ghost_create_jobs_array()



/*
 * check_and_reallocate_job_ids()
 *
 * Reallocates the job_ids array of pa if necessary to make space for the new job id at index.
 * The size is doubled until index fits; existing entries are copied and the new slots are
 * zeroed. Nothing is done when index is already below the current size.
 *
 * If the larger table cannot be allocated an error is logged and pa is left unchanged, so
 * callers must confirm that index is below pa->ai_qs.array_size before storing into job_ids.
 *
 * @param pa - the job array that we're checking to make sure has enough space for this job id
 * @param index - the index of the new job id
 */

void check_and_reallocate_job_ids(

  job_array *pa,
  int        index)

  {
  int old_size = pa->ai_qs.array_size;

  if (index < old_size)
    return;

  // Start from at least 1: doubling a size of 0 would never get past index
  int new_size = (old_size > 0) ? old_size : 1;

  do
    {
    new_size *= 2;
    }
  while (new_size <= index);

  char **new_ids = (char **)calloc(new_size, sizeof(char *));

  if (new_ids == NULL)
    {
    char log_buf[LOCAL_LOG_BUF_SIZE];

    snprintf(log_buf, sizeof(log_buf),
      "could not allocate %d job id slots for array %s", new_size, pa->ai_qs.parent_id);
    log_err(ENOMEM, __func__, log_buf);

    return;
    }

  if ((pa->job_ids != NULL) &&
      (old_size > 0))
    memcpy(new_ids, pa->job_ids, (sizeof(char *) * old_size));

  free(pa->job_ids);

  pa->job_ids = new_ids;
  pa->ai_qs.array_size = new_size;
  } // END check_and_reallocate_job_ids()



/*
 * update_recovered_array_values()
 *
 * Adds pjob to the counters kept in pa->ai_qs. Every job increments num_jobs.
 * A running job also increments num_started and jobs_running. A complete job
 * also increments num_started and jobs_done, plus num_successful when its
 * exit status attribute is 0 or num_failed otherwise. Jobs in any other
 * state change nothing else.
 *
 * @param pa - the array whose counters are updated
 * @param pjob - the job being accounted for
 */
void update_recovered_array_values(

  job_array *pa,
  job       *pjob)

  {
  pa->ai_qs.num_jobs += 1;

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    pa->ai_qs.num_started += 1;
    pa->ai_qs.jobs_running += 1;
    }
  else if (pjob->ji_qs.ji_state == JOB_STATE_COMPLETE)
    {
    pa->ai_qs.num_started += 1;
    pa->ai_qs.jobs_done += 1;

    // an exit status of 0 counts as success, anything else as failure
    if (pjob->ji_wattr[JOB_ATR_exitstat].at_val.at_long != 0)
      pa->ai_qs.num_failed += 1;
    else
      pa->ai_qs.num_successful += 1;
    }

  } // END update_recovered_array_values()
#endif



/*
 * set_array_job_ids()
 *
 * Updates the array struct with pjob's information. Only server builds do any work; in MOM
 * builds the function returns PBSE_NONE immediately.
 *
 * A job whose id contains no '[' is not part of an array and is left untouched. A job without
 * the array id attribute is only treated as an array member when its id contains "[]", in
 * which case it is marked as the array template. For array members the parent array is looked
 * up (or created through ghost_create_jobs_array() when ghost_array_recovery is enabled), the
 * parent id is stored in the job and, for sub-jobs, the job id is recorded in the array's
 * job_ids table at the job's array index.
 *
 * @param pjob - a pointer to the pointer to the job
 * @param log_buf - a buffer for logging; filled with a message when -1 is returned
 * @param buflen - the size of the buffer
 * @return PBSE_NONE on success, -1 when the array is missing and cannot be created or when
 *         the job's array index does not fit the array
 */

int set_array_job_ids(

  job  **pjob,       /* M */
  char  *log_buf,    /* error Buffer */
  size_t buflen)     /* error buffer length */

  {
  int rc = PBSE_NONE;
#ifndef PBS_MOM
  job       *pj = *pjob;
  job_array *pa;
  char       parent_id[PBS_MAXSVRJOBID + 1];
  char      *open_bracket = strchr(pj->ji_qs.ji_jobid, '[');

  // No bracket in the id: this job has nothing to do with arrays
  if (open_bracket == NULL)
    return(rc);

  // If this variable isn't set this job isn't actually an array subjob.
  if ((pj->ji_wattr[JOB_ATR_job_array_id].at_flags & ATR_VFLAG_SET) == 0)
    {
    // Check and set if this is the array template job
    if (*(open_bracket + 1) != ']')
      return(rc);

    pj->ji_is_array_template = TRUE;
    }

  /* job is part of an array.  We need to put a link back to the server
  job array struct for this array. We also have to link this job into
  the linked list of jobs belonging to the array. */

  array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
  pa = get_array(parent_id);

  if (pa == NULL)
    {
    if (!ghost_array_recovery)
      {
      snprintf(log_buf, buflen, "array struct missing for array job %s", pj->ji_qs.ji_jobid);

      // NOTE(review): job_abt() receives the address of the local pointer, so the caller's
      // pointer is not updated if the job is released there - confirm the caller's cleanup
      job_abt(&pj, "Array job missing array struct, aborting job");
      return(-1);
      }

    pa = ghost_create_jobs_array(pj, parent_id);

    if (pa == NULL)
      {
      snprintf(log_buf, buflen, "could not create the array struct for array job %s",
        pj->ji_qs.ji_jobid);
      return(-1);
      }
    }

  strcpy(pj->ji_arraystructid, parent_id);

  if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
    {
    pj->ji_is_array_template = TRUE;
    }
  else
    {
    long job_index = pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long;

    // If the original array wasn't recovered, then we don't know if we have the right size for
    // job_ids. Check and ensure that it's big enough.
    if (pa->ai_ghost_recovered)
      check_and_reallocate_job_ids(pa, (int)job_index);

    // Never write outside of the table, whatever the recovered index says
    if ((job_index < 0) ||
        (job_index >= pa->ai_qs.array_size))
      {
      snprintf(log_buf, buflen, "array index %ld of job %s does not fit in array %s",
        job_index,
        pj->ji_qs.ji_jobid,
        parent_id);

      unlock_ai_mutex(pa, __func__, "2", LOGLEVEL);
      return(-1);
      }

    if (pa->ai_ghost_recovered)
      update_recovered_array_values(pa, pj);

    // The slot may already hold a copy of this id (ghost_create_jobs_array() stores one
    // for the job that triggered the creation); release it instead of leaking it
    free(pa->job_ids[job_index]);
    pa->job_ids[job_index] = strdup(pj->ji_qs.ji_jobid);
    pa->jobs_recovered++;

    /* This is a bit of a kluge, but for some reason if an array job was 
       on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
       value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
       the substate is JOB_SUBSTATE_HELD
    */
    if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
        (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
      {
      pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
      pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
      }
    }

  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
#endif /* !PBS_MOM */

  return(rc);
  }


/*
 * job_recov_xml() - recover a job from an xml format save file
 *
 * Parses the file and, when its root element is JOB_TAG, passes the element
 * to parse_job_dom() to fill in the job.
 *
 * Returns: PBSE_NONE on success,
 *          PBSE_INVALID_SYNTAX when the file cannot be parsed as xml at all,
 *          -1 when the file is xml but has no JOB_TAG root element or
 *             parse_job_dom() fails.
 */

int job_recov_xml(

  const char  *filename,  /* I */   /* pathname to job save file */
  job        **pjob,     /* M */   /* pointer to a pointer of job structure to fill info */
  char        *log_buf,   /* O */   /* buffer to hold error message */
  size_t       buf_len)  /* I */   /* len of the error buffer */

  {
  xmlDoc  *doc = NULL;
  xmlNode *root_element = NULL;
  int      rc = PBSE_NONE;

  /*parse the file and get the DOM */
  doc = xmlReadFile(filename, NULL, 0);

  if (doc == NULL)
    return(PBSE_INVALID_SYNTAX);

  /*Get the root element node; it is NULL when the document has none */
  root_element = xmlDocGetRootElement(doc);

  if ((root_element == NULL) ||
      (strcmp((const char *) root_element->name, JOB_TAG)))
    {
    snprintf(log_buf, buf_len, "missing root tag %s in xml", JOB_TAG);

    /* set return code of -1 as we do have a JB xml but it did not have the right root elem. */
    rc = -1;
    }
  else if (parse_job_dom(filename, pjob, root_element, log_buf, buf_len))
    {
    rc = -1;
    }

  xmlFreeDoc(doc);

  return(rc);
  } /* END job_recov_xml */


/*
 * job_recov_binary() - recover (read in) a job from a binary format save file
 *
 * Reads the quick save sub-structure into the job, upgrades it through
 * job_qs_upgrade() when its version is not PBS_QS_VERSION, checks that the
 * file name starts with the job's file prefix (followed by pbs_rm_port in
 * a multi-mom MOM), and then reads the working attributes with
 * recov_attr(). MOM builds also try to read the tm sockets; a failure there
 * is only logged as a warning.
 *
 * Returns: PBSE_NONE on success, -1 on failure with a message describing
 *   the failure in log_buf.
*/

int job_recov_binary(

  const char *filename,  /* I */   /* pathname to job save file */
  job  **pjob,     /* M */   /* pointer to a pointer of job structure to fill info */
  char *log_buf,   /* O */   /* buffer to hold error message */
  size_t buf_len)  /* I */   /* len of the error buffer */

  {
  job *pj = *pjob;
  int  fds = open(filename, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, buf_len, "unable to open %s", filename);
    return -1;
    }

  /* read in job quick save sub-structure */
  /* NOTE(review): a short read is only fatal when the version field matches;
   * presumably files written by older versions are shorter and are left to
   * job_qs_upgrade() below - confirm */

  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, buf_len, "Unable to read %s", filename);
    close(fds);
    return -1;
    }

  /* is ji_qs the version we expect? */
  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, buf_len,
      "%s appears to be from an old version. Attempting to convert.\n",
      filename);
    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, (char *)filename, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, buf_len, "unable to upgrade %s\n", filename);
      close(fds);
      return -1;
      }
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  // skip over the leading path, if the file name has one
  const char *pn = strrchr(filename, '/');

  if (pn == NULL)
    pn = filename;
  else
    pn++;

#ifdef PBS_MOM
  char fileid[MAXPATHLEN];

  // in multi-mom mode the port is part of the file name
  if (multi_mom != 0)
    snprintf(fileid, sizeof(fileid), "%s%d", pj->ji_qs.ji_fileprefix, pbs_rm_port);
  else
    snprintf(fileid, sizeof(fileid), "%s", pj->ji_qs.ji_fileprefix);

  const char *expected_prefix = fileid;
#else
  const char *expected_prefix = pj->ji_qs.ji_fileprefix;
#endif

  if (strncmp(pn, expected_prefix, strlen(expected_prefix)) != 0)
    {
    /* mismatch, discard job */

    snprintf(log_buf, buf_len, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      filename);

    close(fds);
    return -1;
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    snprintf(log_buf, buf_len, "unable to recover %s (file is likely corrupted)", filename);
    close(fds);
    return -1;
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, buf_len,
        "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
        filename);

    log_err(-1, __func__, log_buf);
    }
#endif /* PBS_MOM */

  close(fds);

  return PBSE_NONE;
  }  /* END job_recov_binary() */





/*
 * job_recov() - recover (read in) a job from its save file
 *
 * Allocates a job with job_alloc() and fills it from the save file. The
 * xml format is tried first; the binary format is only tried when the file
 * cannot be parsed as xml. In MOM builds filename is taken relative to
 * path_jobs. In server builds a recovered job is then linked to its job
 * array through set_array_job_ids().
 *
 * A successfully recovered job is marked as committed and written back to
 * disk with job_save().
 *
 * Returns: job pointer to new job structure or a
 *   null pointer on an error.
*/

job *job_recov(

  const char *filename) /* I */   /* pathname to job save file */

  {
  char        log_buf[LOCAL_LOG_BUF_SIZE];
  const char *recov_path = filename;
  int         rc;
  job        *pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  /* a helper may fail without writing a message; never log uninitialized memory */
  log_buf[0] = '\0';

#ifdef PBS_MOM
  // job directory path, filename
  char namebuf[MAXPATHLEN];

  snprintf(namebuf, sizeof(namebuf), "%s%s", path_jobs, filename);
  recov_path = namebuf;
#endif

  rc = job_recov_xml(recov_path, &pj, log_buf, sizeof(log_buf));

  /* not an xml file at all: fall back to the binary format */
  if (rc == PBSE_INVALID_SYNTAX)
    rc = job_recov_binary(recov_path, &pj, log_buf, sizeof(log_buf));

#ifndef PBS_MOM
  if (rc == PBSE_NONE)
    rc = set_array_job_ids(&pj, log_buf, sizeof(log_buf));
#endif

  if (rc != PBSE_NONE) 
    {
    /* the helpers receive the address of pj, so it is checked before it is released */
    if ((rc == -1) &&
        (pj != NULL))
      {
      log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
      free(pj->ji_mutex);
#endif
      free((char *)pj);
      } /* sometime pjob is freed by abt_job() */
    /* NOTE(review): set_array_job_ids() returns -1 after calling job_abt();
     * if job_abt() can release the job this path frees it again - confirm */

    return(NULL);
    }

  pj->ji_commit_done = 1;

  /* all done recovering the job */

#ifdef PBS_MOM
  job_save(pj, SAVEJOB_FULL, (multi_mom == 0)?0:pbs_rm_port);
#else
  job_save(pj, SAVEJOB_FULL, 0);
#endif

  return(pj);
  }  /* END job_recov() */
