import os
import re
import subprocess
import sys

import json

from collections import defaultdict

import xml.etree.ElementTree as ET  # This is used to read configuration XML

import pandas as pd

from projects_config import projects

# This regular expression matches one 'open' system call line of strace output and
#  captures the filepath, the flags ("mode"), an optional third numeric argument
#  ("num"), the return code, and whatever text follows the return code ("msg").
#  An example of strace's output is:
# open("Config/Algorithms/CLOUD_HEIGHT_EN/CLOUD_HEIGHT_EN.xml", O_RDONLY) = 4
#  In this example, the mode is O_RDONLY, the code is 4, num is None (the optional
#  group did not participate), and msg is the empty string (the msg group always
#  participates, so it is never None).
#  TODO: what is the meaning of num? NOTE(review): presumably the permission bits
#   that open(2) takes as its third argument (a la chmod) -- confirm.
#  Limitations visible from the pattern itself:
#   * filepath only accepts word characters and + - . / so a path containing a
#     space or any other character makes the whole line fail to match.
#   * the pattern is anchored at the start of the line, so a line carrying any
#     prefix before `open(` does not match.
#     NOTE(review): strace -f may prefix lines with `[pid N]` -- confirm.
strace_prog = re.compile(r'''
    ^open\(                       # The system call open
    "(?P<filepath>[\w\+\-\./]+)"  # The filepath is in double-quotes
    ,\s*                          # Comma, whitespace
    (?P<mode>[\w\|]+)             # mode will usually be O_RDONLY or O_RDWR
    (?:,\s*(?P<num>\d+))?         # open may have an additional numerical argument
    \) \s* = \s*                  # End of open
    (?P<code>\-?\d+)              # Return code from open, -1 if error
    (?P<msg>.*)$                  # An error message when code is -1
    ''', re.VERBOSE)
def parse_strace(st):
  """Yield (absolute filepath, mode, code) for each `open` line of strace output.

  st is an iterable of strace output lines. Empty lines and lines containing
  '+++ exited with' or '/dev/shm/' are dropped silently. Any other line that
  strace_prog cannot match is reported on stdout and dropped. The filepath is
  made absolute relative to the current working directory; mode and code are
  the strings captured by the pattern (code is not converted to int).
  """
  ignored_markers = ('+++ exited with', '/dev/shm/')
  for line in st:
    if not line:
      continue
    if any(marker in line for marker in ignored_markers):
      continue
    found = strace_prog.match(line)
    if found is None:
      print('!!! Could not match strace line: {}'.format(line))
      continue
    filepath, mode, code = found.group('filepath', 'mode', 'code')
    yield os.path.abspath(filepath), mode, code

def process_strace(st, **kwargs):
  """Turn raw strace text into a list of per-open record dictionaries.

  st is the strace output as a single string; it is split on newlines and fed
  to parse_strace. Each record has the keys 'filepath', 'mode' and 'code',
  followed by every keyword argument given here (a keyword argument with one
  of those three names overrides the parsed value).
  """
  return [
      dict(dict(filepath=path, mode=flags, code=ret), **kwargs)
      for path, flags, ret in parse_strace(st.split('\n'))
  ]

def run_fw_mode(config_filename, fw_mode, log_filename, seg=None, p_env=None):
  """Run ./framework.exe in one mode under strace and return the captured text.

  config_filename: path handed to the framework via -c.
  fw_mode: value for the -m option ('pre', 'seg', 'post', ...); None omits -m.
  log_filename: file receiving the command's stdout. It is truncated when
    fw_mode is 'pre' and appended to otherwise.
  seg: segment number for -s, only used when fw_mode is 'seg'; a falsy value
    (None or 0) is treated as 1.
  p_env: environment mapping for the child process, passed to subprocess as-is.

  Returns the text captured from the command as a string. If the command exits
  with a non-zero status, the status is reported on stdout and whatever was
  captured before the failure is returned (an empty string if nothing was), so
  callers always receive a string.
  """
  job = 'trace_system_io_job'
  fw_mode_opt = '-m {}'.format(fw_mode) if fw_mode is not None else ''
  seg = seg or 1
  if fw_mode == 'seg' and seg > 0:
    fw_mode_opt += ' -s {}'.format(seg)
  # `2>&1` precedes the stdout redirection on purpose: stderr is first pointed at
  #  the pipe read by check_output, and only then is stdout sent to the log file.
  cmd = 'strace -f -e open ./framework.exe -c {config} -j {jobid} {fw_mode_opt} 2>&1'
  cmd += ' >{log}' if fw_mode == 'pre' else ' >>{log}'  # Start a new log if preprocessing, otherwise append FIXME
  cmd = cmd.format(config=config_filename, jobid=job, fw_mode_opt=fw_mode_opt, log=log_filename)
  print('  Executing shell command `%s`' % cmd)
  try:
    strace_output = subprocess.check_output(cmd, shell=True, universal_newlines=True, env=p_env)
  except subprocess.CalledProcessError as e:
    print("!!! subprocess call ({}) gave return code {}".format(fw_mode, e.returncode))
    # Keep what was captured before the failure instead of leaving the result unbound.
    strace_output = e.output or ''
  return strace_output

def run_fw(config, output_prefix, p_env=None, **kwargs):
  """Trace every framework phase for one configuration and record the results.

  Four files are written, all named from output_prefix:
    '.xml'  the configuration used (written by config.write),
    '.json' the environment used (p_env dumped as JSON),
    '.log'  the log file handed to run_fw_mode,
    '.csv'  one row per traced `open` call.

  config: object providing write(filename) and find(tag); the elements
    ENV_INPUT_LIST and ENV_OUTPUT_LIST must exist because their text is printed.
  output_prefix: path prefix (directory plus base name) for the files above.
  p_env: environment mapping for the framework process, or None. When it is
    None, or lacks ENV_NUM_COL_SEG / ENV_NUM_ROW_SEG, a single segment run is
    done with seg=None.
  kwargs: extra columns attached to every row of the result.

  Returns the pandas DataFrame that was written to the '.csv' file.
  """
  config_filename = output_prefix + '.xml'
  log_filename = output_prefix + '.log'
  sysopen_filename = output_prefix + '.csv'
  env_filename = output_prefix + '.json'

  config.write(config_filename)  # Record the configuration XML used
  for elem in ['ENV_INPUT_LIST', 'ENV_OUTPUT_LIST']:
    print('  %s: %s' % (elem, config.find(elem).text))
  with open(env_filename, 'w') as f:
    json.dump(p_env, f, indent=2)  # Record the environment used

  file_openings = []

  ### Preprocessing
  file_openings += process_strace(
      run_fw_mode(config_filename, 'pre', log_filename, p_env=p_env),
      fw_mode='pre', **kwargs)
  ### Segment-processing
  try:
    total_segments = int(p_env['ENV_NUM_COL_SEG']) * int(p_env['ENV_NUM_ROW_SEG'])
  except (KeyError, TypeError):
    # KeyError: the environment carries no segmentation info.
    # TypeError: p_env is None (the default) and cannot be subscripted.
    segments = (None,)  # TODO For now, just do the first segment, but we can get segmentation info from configuration XML
  else:
    segments = range(1, 1 + total_segments)
  for _s in segments:
    file_openings += process_strace(
        run_fw_mode(config_filename, 'seg', log_filename, p_env=p_env, seg=_s),
        fw_mode='seg', seg=_s, **kwargs)
  ### Postprocessing
  file_openings += process_strace(
      run_fw_mode(config_filename, 'post', log_filename, p_env=p_env),
      fw_mode='post', **kwargs)

  opens_df = pd.DataFrame(file_openings)
  # Let pandas open the file itself; opening it a second time here is unnecessary.
  opens_df.to_csv(sysopen_filename)
  return opens_df

def _find_first(config, *tags):
  """Return the first element config.find() locates among tags, else None."""
  for tag in tags:
    elem = config.find(tag)
    # Compare with None explicitly: an Element without children is falsy.
    if elem is not None:
      return elem
  return None

def run_all(config, algorithms, p, p_env=None):
  """Trace the framework for all algorithms together, then for each one alone.

  config: parsed configuration tree (provides find() and getroot()). It is
    modified in place: missing ENV_INPUT_LIST / ENV_OUTPUT_LIST /
    ENV_AIT_FILE_TIME_STAMP elements are inserted, and the input/output
    directory and list elements are rewritten for every run.
  algorithms: iterable of algorithm names; duplicates are ignored and the
    names are processed in sorted order so runs are reproducible.
  p: project name, used in the output directory names and passed to run_fw.
  p_env: environment mapping for the framework process, or None. It may gain an
    ENV_AIT_FILE_TIME_STAMP entry; when None and that entry is needed, a copy
    of os.environ is used instead.

  Results are written by run_fw under ./EN_MASSE_OUTPUT/<p>/ and
  ./SINGULAR_OUTPUT/<p>/ relative to the current working directory.
  """
  algorithms = set(algorithms)

  input_list = config.find('ENV_INPUT_LIST')
  if input_list is None:
    input_list = ET.Element('ENV_INPUT_LIST', attrib={'class': 'string-array'})
    config.getroot().insert(3, input_list)
  # `find(a) or find(b)` would discard a found element that has no children,
  #  because such an Element is falsy; look the tags up explicitly instead.
  input_dir = _find_first(config, 'ENV_INPUT_DIRECTORY', 'input_directory')
  output_dir = _find_first(config, 'ENV_OUTPUT_DIRECTORY', 'output_directory')

  output_list = config.find('ENV_OUTPUT_LIST')
  if output_list is None:  # FIXME figure out a better way to do this
    output_list = ET.Element('ENV_OUTPUT_LIST', attrib={'class': 'string-array'})
    output_list.text = ','.join(sorted(algorithms))
    config.getroot().insert(3, output_list)

  # NOTE(review): the lookup uses 'AIT_FILE_TIME_STAMP' while the element created
  #  below is 'ENV_AIT_FILE_TIME_STAMP' -- confirm which tag is intended.
  ait_ts = config.find('AIT_FILE_TIME_STAMP')
  if ait_ts is None:
    ait_ts = ET.Element('ENV_AIT_FILE_TIME_STAMP', attrib={'class': 'string'})
    ait_ts.text = '${ENV_AIT_FILE_TIME_STAMP}'
    config.getroot().insert(3, ait_ts)
    if p_env is None:
      # No explicit environment means "inherit"; copy it so the entry can be added.
      p_env = dict(os.environ)
    p_env['ENV_AIT_FILE_TIME_STAMP'] = "{year=2019;month=01;day=1;hour=11;min=11;sec=42.4}"

  ordered_algorithms = sorted(algorithms)
  print('\nAlgorithms to test are:')
  for a in ordered_algorithms:
    print('  %s' % a)

  # Phase 1: run the Framework with no input files.
  #  In other words, compute all algorithms' results from scratch and write them to files
  input_dir.text = './Input/'  # Shouldn't get used, but CFG complains if the content doesn't exist
  output_dir.text = './EN_MASSE_OUTPUT/{}/'.format(p)
  os.makedirs(output_dir.text, exist_ok=True)  #TODO be careful about paths and cwd
  input_list.text = ''
  output_list.text = ','.join(ordered_algorithms)
  print('\nTesting algorithms en masse...')
  run_fw(
      config,
      output_dir.text + 'en_masse',
      p_env,
      algorithm='EN_MASSE',
      project=p)

  # Phase 2: for each algorithm, run the Framework using the phase 1 output of all other
  #  algorithms as input.
  input_dir.text = './EN_MASSE_OUTPUT/{}/'.format(p)
  output_dir.text = './SINGULAR_OUTPUT/{}/'.format(p)
  os.makedirs(input_dir.text, exist_ok=True)
  os.makedirs(output_dir.text, exist_ok=True)
  for a in ordered_algorithms:
    input_list.text = ','.join(sorted(algorithms - {a}))
    output_list.text = a
    print('\nTesting algorithm %s...' % a)
    run_fw(
        config,
        output_dir.text + a,
        p_env,
        algorithm=a,
        project=p)

def main(algorithms):
  """Run the tracing workflow for the configured projects.

  algorithms is an iterable of algorithm names; it is converted to a set before
  being handed to run_all. Each project's configuration is read from
  Config/Projects/<project>/Config.xml, and its environment is the current
  process environment overlaid with the project's own entries.
  """
  for project_name, project_env in projects.items():
    #FIXME remove after fixing AIT time stamps
    if project_name == 'AIT_VIIRS':
      print('\n == {} == '.format(project_name))
      config_path = 'Config/Projects/{}/Config.xml'.format(project_name)
      tree = ET.parse(config_path)
      merged_env = dict(os.environ, **project_env)
      run_all(tree, set(algorithms), project_name, merged_env)

if __name__ == '__main__':
  # Algorithms traced when this file is executed as a script.
  main(['CRTM', 'SFC_EMISS_SEEBOR'])