"""Trace the files opened by ``./framework.exe`` using ``strace``.

For every configured project the framework is run once with all algorithms
enabled ("en masse") and then once per algorithm, feeding it the en-masse
output of the other algorithms.  Every ``open`` system call reported by
strace is collected into a CSV file next to the run's log.
"""
import json
import os
import re
import shlex
import subprocess
import sys
from collections import defaultdict
import xml.etree.ElementTree as ET  # This is used to read configuration XML

import pandas as pd

from projects_config import projects

# This regular expression will match 'open' system calls and extract information
# such as the filepath, mode, and any error message. An example of strace's output is:
#     open("Config/Algorithms/CLOUD_HEIGHT_EN/CLOUD_HEIGHT_EN.xml", O_RDONLY) = 4
# In this example, the mode is O_RDONLY, the code is 4, num is None, and msg is ''.
# `num` is the optional third argument of open(2): the permission bits
# (a la chmod) that apply when the flags request file creation.
strace_prog = re.compile(r'''
    ^open\(                         # The system call open
    "(?P<filepath>[\w\+\-\./]+)"    # The filepath is in double-quotes
    ,\s*                            # Comma, whitespace
    (?P<mode>[\w\|]+)               # mode will usually be O_RDONLY or O_RDWR
    (?:,\s*(?P<num>\d+))?           # open may have an additional numerical argument
    \) \s* = \s*                    # End of open
    (?P<code>\-?\d+)                # Return code from open, -1 if error
    (?P<msg>.*)$                    # An error message when code is -1
    ''', re.VERBOSE)


def parse_strace(st):
    """Yield ``(absolute_filepath, mode, code)`` for each ``open`` call.

    ``st`` is an iterable of strace output lines.  Empty lines, process-exit
    notices and anything touching ``/dev/shm/`` are skipped silently; any
    other line that does not match ``strace_prog`` is reported on stdout and
    skipped.  ``mode`` and ``code`` are yielded as strings.
    """
    for line in st:
        if not line or '+++ exited with' in line or '/dev/shm/' in line:
            continue
        strace_match = strace_prog.match(line)
        if strace_match is None:
            print('!!! Could not match strace line: {}'.format(line))
            continue
        yield (os.path.abspath(strace_match.group('filepath')),
               strace_match.group('mode'),
               strace_match.group('code'))


def process_strace(st, **kwargs):
    """Turn raw strace output into a list of record dicts.

    ``st`` is the complete strace output as one string.  Each record has the
    keys ``filepath``, ``mode`` and ``code`` plus every keyword argument
    given (keyword arguments win on a name clash).
    """
    return [
        dict(dict(filepath=fp, mode=mode, code=code), **kwargs)
        for fp, mode, code in parse_strace(st.split('\n'))
    ]


def run_fw_mode(config_filename, fw_mode, log_filename, seg=None, p_env=None):
    """Run the framework in one mode under strace and return strace's output.

    ``fw_mode`` is passed to the framework as ``-m`` (omitted when None);
    in ``'seg'`` mode the segment number ``seg`` (default 1) is passed as
    ``-s``.  The framework's stdout goes to ``log_filename`` (truncated for
    ``'pre'``, appended otherwise) while stderr, which carries the strace
    report, is captured and returned as a string.  ``p_env`` is the
    environment for the child process (None inherits the current one).

    A non-zero exit status is reported on stdout; whatever output was
    captured before the failure is still returned.
    """
    job = 'trace_system_io_job'
    fw_mode_opt = '-m {}'.format(fw_mode) if fw_mode is not None else ''
    seg = seg or 1
    if fw_mode == 'seg' and seg > 0:
        fw_mode_opt += ' -s {}'.format(seg)

    # `2>&1 >log` is order-sensitive: stderr is pointed at the pipe first,
    # then stdout alone is redirected to the log file.
    cmd = 'strace -f -e open ./framework.exe -c {config} -j {jobid} {fw_mode_opt} 2>&1'
    # Start a new log if preprocessing, otherwise append FIXME
    cmd += ' >{log}' if fw_mode == 'pre' else ' >>{log}'
    # Quote the file names so that paths with spaces or shell
    # metacharacters survive the shell.
    cmd = cmd.format(config=shlex.quote(config_filename), jobid=job,
                     fw_mode_opt=fw_mode_opt, log=shlex.quote(log_filename))
    print(' Executing shell command `%s`' % cmd)
    try:
        strace_output = subprocess.check_output(
            cmd, shell=True, universal_newlines=True, env=p_env)
    except subprocess.CalledProcessError as e:
        print("!!! subprocess call ({}) gave return code {}".format(fw_mode, e.returncode))
        # Keep what strace reported up to the failure instead of leaving
        # the result unbound.
        strace_output = e.output or ''
    return strace_output


def run_fw(config, output_prefix, p_env=None, **kwargs):
    """Run pre-, segment- and post-processing and record all file openings.

    Writes ``<output_prefix>.xml`` (the configuration used),
    ``<output_prefix>.json`` (the environment used), ``<output_prefix>.log``
    (framework stdout) and ``<output_prefix>.csv`` (the collected ``open``
    calls).  Extra keyword arguments are added as columns to every record.
    Returns the records as a pandas DataFrame.
    """
    config_filename = output_prefix + '.xml'
    log_filename = output_prefix + '.log'
    sysopen_filename = output_prefix + '.csv'
    env_filename = output_prefix + '.json'

    config.write(config_filename)  # Record the configuration XML used
    for elem in ['ENV_INPUT_LIST', 'ENV_OUTPUT_LIST']:
        print(' %s: %s' % (elem, config.find(elem).text))
    with open(env_filename, 'w') as f:
        json.dump(p_env, f, indent=2)  # Record the environment used

    file_openings = []

    ### Preprocessing
    file_openings += process_strace(
        run_fw_mode(config_filename, 'pre', log_filename, p_env=p_env),
        fw_mode='pre', **kwargs)

    ### Segment-processing
    env = p_env or {}  # p_env may be None; treat that as "no segmentation info"
    try:
        total_segments = int(env['ENV_NUM_COL_SEG']) * int(env['ENV_NUM_ROW_SEG'])
        segments = range(1, 1 + total_segments)
    except KeyError:
        # TODO For now, just do the first segment, but we can get
        # segmentation info from configuration XML
        segments = (None,)
    for _s in segments:
        file_openings += process_strace(
            run_fw_mode(config_filename, 'seg', log_filename, p_env=p_env, seg=_s),
            fw_mode='seg', seg=_s, **kwargs)

    ### Postprocessing
    file_openings += process_strace(
        run_fw_mode(config_filename, 'post', log_filename, p_env=p_env),
        fw_mode='post', **kwargs)

    opens_df = pd.DataFrame(file_openings)
    # to_csv opens the path itself; no separate file handle is needed.
    opens_df.to_csv(sysopen_filename)
    return opens_df


def _find_first(config, *tags):
    """Return the first element found for any of ``tags``, else None."""
    for tag in tags:
        elem = config.find(tag)
        # Compare with None explicitly: an Element without children is falsy.
        if elem is not None:
            return elem
    return None


def run_all(config, algorithms, p, p_env=None):
    """Trace the framework en masse and then once per algorithm.

    ``config`` is the parsed configuration ``ElementTree`` (modified in
    place), ``algorithms`` an iterable of algorithm names, ``p`` the project
    name used in the output paths and ``p_env`` the environment mapping for
    the framework (a copy of ``os.environ`` when None).

    Raises ``ValueError`` if the configuration has no input or output
    directory element.
    """
    algorithms = set(algorithms)
    if p_env is None:
        p_env = dict(os.environ)

    input_list = config.find('ENV_INPUT_LIST')
    if input_list is None:
        input_list = ET.Element('ENV_INPUT_LIST', attrib={'class': 'string-array'})
        config.getroot().insert(3, input_list)

    input_dir = _find_first(config, 'ENV_INPUT_DIRECTORY', 'input_directory')
    output_dir = _find_first(config, 'ENV_OUTPUT_DIRECTORY', 'output_directory')
    if input_dir is None or output_dir is None:
        raise ValueError(
            'configuration for {} lacks an input or output directory element'.format(p))

    output_list = config.find('ENV_OUTPUT_LIST')
    if output_list is None:
        # FIXME figure out a better way to do this
        output_list = ET.Element('ENV_OUTPUT_LIST', attrib={'class': 'string-array'})
        output_list.text = ','.join(sorted(algorithms))
        config.getroot().insert(3, output_list)

    # NOTE(review): the lookup uses 'AIT_FILE_TIME_STAMP' while the element
    # inserted is 'ENV_AIT_FILE_TIME_STAMP' -- confirm which name is intended.
    ait_ts = config.find('AIT_FILE_TIME_STAMP')
    if ait_ts is None:
        ait_ts = ET.Element('ENV_AIT_FILE_TIME_STAMP', attrib={'class': 'string'})
        ait_ts.text = '${ENV_AIT_FILE_TIME_STAMP}'
        config.getroot().insert(3, ait_ts)
        # NOTE(review): assumed to belong to this branch; the original
        # indentation of this statement could not be recovered.
        p_env['ENV_AIT_FILE_TIME_STAMP'] = "{year=2019;month=01;day=1;hour=11;min=11;sec=42.4}"

    print('\nAlgorithms to test are:')
    for a in sorted(algorithms):
        print(' %s' % a)

    # Phase 1: run the Framework with no input files.
    # In other words, compute all algorithms' results from scratch and write them to files
    input_dir.text = './Input/'  # Shouldn't get used, but CFG complains if the content doesn't exist
    output_dir.text = './EN_MASSE_OUTPUT/{}/'.format(p)
    os.makedirs(output_dir.text, exist_ok=True)  # TODO be careful about paths and cwd
    input_list.text = ''
    output_list.text = ','.join(sorted(algorithms))
    print('\nTesting algorithms en masse...')
    run_fw(
        config, output_dir.text + 'en_masse', p_env,
        algorithm='EN_MASSE', project=p)

    # Phase 2: for each algorithm, run the Framework using the phase 1 output of all other
    # algorithms as input.
    input_dir.text = './EN_MASSE_OUTPUT/{}/'.format(p)
    output_dir.text = './SINGULAR_OUTPUT/{}/'.format(p)
    os.makedirs(input_dir.text, exist_ok=True)
    os.makedirs(output_dir.text, exist_ok=True)
    # Sorted so that the run order and the written lists are reproducible.
    for a in sorted(algorithms):
        input_list.text = ','.join(sorted(algorithms - {a}))
        output_list.text = a
        print('\nTesting algorithm %s...' % a)
        run_fw(
            config, output_dir.text + a, p_env,
            algorithm=a, project=p)


def main(algorithms):
    """Run the tracing for every configured project.

    ``algorithms`` is an iterable of algorithm names.  Each project's
    environment overrides are layered on top of the current environment.
    """
    for p, p_env in projects.items():
        if p != 'AIT_VIIRS':  # FIXME remove after fixing AIT time stamps
            continue
        print('\n == {} == '.format(p))
        config = ET.parse('Config/Projects/{}/Config.xml'.format(p))
        run_all(config, set(algorithms), p, dict(os.environ, **p_env))


if __name__ == '__main__':
    main('CRTM,SFC_EMISS_SEEBOR'.split(','))