#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Copyright 2018 Cognitive Computation Lab
University of Freiburg
Lukas Elflein <elfleinl@cs.uni-freiburg.de>
"""

import pandas as pd
import numpy as np


def get_data(file_name, syllogs):
    '''
    Parse the rg16 dataset into one answer matrix per participant.

    Parameters:
        file_name: path (or file-like object) of the ';'-separated csv.
            The columns 'syllog', 'code', 'ConclQ' and 'ConclDir' are read.
        syllogs: syllogism labels to keep. The position of a label in this
            sequence is the row of that syllogism in every answer matrix.

    Returns:
        dict mapping each participant code to an integer array with
        len(syllogs) rows and 9 columns (64x9 for the full set of
        syllogisms). A 1 marks the answer given to the syllogism of that
        row; syllogisms a participant has no row for stay all-zero.

    Raises:
        ValueError: if an answer that is not 'NVC' has a quantifier other
            than 'A', 'I', 'E' or 'O'.
    '''
    # Materialise once: the labels are needed for filtering, ranking and sizing
    syllogs = list(syllogs)

    # Read the raw data via pandas
    raw = pd.read_csv(file_name, sep=';')

    # Use only the requested syllogisms (drops e.g. the sham questions).
    # Copy, so that adding the rank column does not write into a slice of
    # the frame that was read.
    raw = raw.loc[raw['syllog'].isin(syllogs)].copy()

    # Rank every row by the position of its syllogism in `syllogs` and sort
    # by that rank; participants are then visited in the sorted order.
    rank_of = {label: rank for rank, label in enumerate(syllogs)}
    raw['Tm_Rank'] = raw['syllog'].map(rank_of)
    raw = raw.sort_values(by='Tm_Rank')

    part_answers = {}
    for part in raw['code'].unique():
        part_data = raw.loc[raw['code'] == part]

        # One row per syllogism, one column per possible answer
        answers = np.zeros((len(syllogs), 9), dtype=int)
        rows = zip(part_data['Tm_Rank'],
                   part_data['ConclQ'],
                   part_data['ConclDir'])
        for rank, quantifier, direction in rows:
            # Index by the syllogism's rank rather than by the position of
            # the row, so a missing row cannot shift the answers after it.
            answers[int(rank), _answer_column(quantifier, direction)] = 1

        part_answers[part] = answers
    return part_answers


def _answer_column(quantifier, direction):
    '''
    Return the answer-matrix column for a conclusion.

    Columns 0-3 hold the quantifiers A, I, E, O; a direction of 'CA' moves
    them to columns 4-7; a direction of 'NVC' always selects column 8.
    '''
    # 'NVC' overrides the quantifier, so it is tested before anything else
    if direction == 'NVC':
        return 8

    offsets = {'A': 0, 'I': 1, 'E': 2, 'O': 3}
    if quantifier not in offsets:
        raise ValueError(
            'Unknown conclusion quantifier {!r} (direction {!r})'.format(
                quantifier, direction))

    column = offsets[quantifier]
    if direction == 'CA':
        column += 4
    return column


def generate_syllogisms():
    '''
    Build the list of all 64 syllogism labels.

    A label is the quantifier of the first premise, the quantifier of the
    second premise and the figure, e.g. 'AE3'. The figure varies fastest,
    then the second quantifier, then the first (the ordering the original
    author attributes to Khemlani 2012).
    '''
    moods = 'AIEO'
    figures = '1234'
    return [
        premise_1 + premise_2 + fig
        for premise_1 in moods
        for premise_2 in moods
        for fig in figures
    ]


if __name__ == '__main__':
    # Script entry point: read ./rg16.csv, sum the per-participant answer
    # matrices and write the totals and the relative frequencies to csv.
    syllogisms = generate_syllogisms()

    # Parse and preprocess data
    in_file = './rg16.csv'
    individual_dicts = get_data(file_name=in_file, syllogs=syllogisms)
    print('Data imported: {}'.format(in_file))

    # Without participants there is nothing to aggregate or to divide by
    n_participants = len(individual_dicts)
    if n_participants == 0:
        raise SystemExit('No syllogism answers found in {}'.format(in_file))

    # Aggregate answer numbers: element-wise sum over all participants
    aggregate = np.sum(np.array(list(individual_dicts.values())), axis=0)

    # Convert to pandas
    conclusions = ['Aac', 'Iac', 'Eac', 'Oac', 'Aca', 'Ica', 'Eca', 'Oca', 'NVC']
    aggregate_df = pd.DataFrame(aggregate, columns=conclusions, index=syllogisms)
    print('Aggregation done.')
    # Export to csv
    total_out_name = 'rg_16_agg_total.csv'
    aggregate_df.to_csv(total_out_name)

    # Convert answer numbers to answer frequencies. Divide by the number of
    # participants actually read instead of a fixed count, so the result
    # stays correct when the input file changes.
    aggregate_frequencies = aggregate_df / n_participants
    frequencies_out_name = 'rg_16_agg_frequencies.csv'
    aggregate_frequencies.to_csv(frequencies_out_name, float_format='%1.3f')
    print('New files: {}, {}'.format(total_out_name, frequencies_out_name))
