"""
Configuration for reproducing Mwata-Velu et al. (2023)
"EEG-BCI Features Discrimination between Executed and Imagined Movements
 Based on FastICA, Hjorth Parameters, and SVM"
Mathematics 2023, 11, 4409. DOI: 10.3390/math11214409

Dataset: PhysioNet EEG Motor Movement/Imagery Dataset (curated CSV format)
"""

from pathlib import Path

# =============================================================================
# Paths
# =============================================================================
# Dataset root, expressed relative to the current working directory.
# Joined from components instead of a backslash-separated literal: "\e" is an
# invalid escape sequence (a SyntaxWarning on recent Python versions), and a
# backslash is an ordinary filename character on POSIX, so the old literal
# named a single entry there rather than "../eegmmidb".
DATA_DIR = Path("..") / "eegmmidb"

# =============================================================================
# Dataset parameters
# =============================================================================
SAMPLING_RATE = 160  # Hz
N_CHANNELS = 64

# All 64 channel labels (Sharbrough system) in PhysioNet column order.
# Written as whitespace-separated rows, one electrode family per row, and
# split into a plain list of strings.
CHANNEL_NAMES = (
    "FC5 FC3 FC1 FCz FC2 FC4 FC6 "
    "C5 C3 C1 Cz C2 C4 C6 "
    "CP5 CP3 CP1 CPz CP2 CP4 CP6 "
    "Fp1 Fpz Fp2 "
    "AF7 AF3 AFz AF4 AF8 "
    "F7 F5 F3 F1 Fz F2 F4 F6 F8 "
    "FT7 FT8 "
    "T7 T8 T9 T10 "
    "TP7 TP8 "
    "P7 P5 P3 P1 Pz P2 P4 P6 P8 "
    "PO7 PO3 POz PO4 PO8 "
    "O1 Oz O2 "
    "Iz"
).split()

# =============================================================================
# Channel selections (Section 3.2)
# =============================================================================

# The 19 channels of the 10-20 system that feed the ICA decomposition
# (Section 3.2), listed front to back.
ICA_CHANNELS = (
    "Fp1 Fp2 F7 F3 Fz F4 F8 "
    "T7 C3 Cz C4 T8 "
    "P7 P3 Pz P4 P8 O1 O2"
).split()

# The paper's 9 "Selected_channels" (Algorithm 1, Step 7): the sensorimotor,
# frontal and parietal channels against which the ICA energy concentration
# criterion is evaluated.
SELECTED_CHANNELS = [
    "C3", "Cz", "C4",
    "F3", "Fz", "F4",
    "P3", "Pz", "P4",
]

# Channels from which Hjorth features are extracted (Section 3.5, Table 5).
# C3, Cz, C4 is the paper's best-performing configuration (Set 3).
TARGET_CHANNELS = ["C3", "Cz", "C4"]

# =============================================================================
# Task / run definitions
# =============================================================================
# NOTE ON NUMBERING: The curated CSV dataset uses a different run numbering
# scheme than the original PhysioNet EDF files. The mapping is:
#
#   Curated CSV    PhysioNet EDF    Task
#   -----------    -------------    ----
#   Run 01         R03              Execute open/close left or right fist
#   Run 02         R04              Imagine open/close left or right fist
#   Run 03         R05              Execute open/close both fists or both feet
#   Run 04         R06              Imagine open/close both fists or both feet
#   Run 05         R07              Execute open/close left or right fist
#   Run 06         R08              Imagine open/close left or right fist
#   Run 07         R09              Execute open/close both fists or both feet
#   Run 08         R10              Imagine open/close both fists or both feet
#   Run 09         R11              Execute open/close left or right fist
#   Run 10         R12              Imagine open/close left or right fist
#   Run 11         R13              Execute open/close both fists or both feet
#   Run 12         R14              Imagine open/close both fists or both feet
#
# The paper's Section 4 states twice that results correspond to R03, R04, R07,
# R08, R11, R12 (left/right fist only). This agrees with another statement
# that only 6 of the 14 runs per subject are used. However, the reported
# sample count (8652 total) would require including all 12 task runs.
# Additionally, the paper also says "samples of the first 10 runs constituted
# the training set; those of the 11th and 12th, and 13th and 14th runs were
# used as the testing and validation sets, respectively". These statements
# contradict each other.
# We use the 6 runs that are listed twice: R03, R04, R07, R08, R11, R12.

# Curated-CSV run numbers of the left/right fist runs, split by task type.
# The trailing comments give the matching PhysioNet EDF run IDs.
EXECUTION_RUNS = [1, 5, 9]     # R03, R07, R11
IMAGERY_RUNS = [2, 6, 10]      # R04, R08, R12

# Every run that is processed: execution runs first, then imagery runs.
TARGET_RUNS = [*EXECUTION_RUNS, *IMAGERY_RUNS]

# Annotation codes from the curated CSV annotation files that mark T1/T2
# events, i.e. active task periods. T0 (rest) codes are deliberately absent.
ACTIVE_EVENT_LABELS = [
    2, 3,
    5, 6,
    8, 9,
    11, 12,
]

# =============================================================================
# Sub-band definitions (Section 3.3)
# =============================================================================
# One (name, lower edge, upper edge) tuple per sub-band, built column-wise.
# Adjacent bands share an edge. Edges are presumably in Hz, like
# SAMPLING_RATE -- confirm against the filtering code.
SUB_BANDS = list(zip(
    ("theta", "alpha", "beta"),   # band name
    (4.0, 8.0, 13.0),             # lower edge
    (8.0, 13.0, 30.0),            # upper edge
))

# =============================================================================
# ICA parameters (Section 3.4, Algorithm 1)
# =============================================================================
# Number of independent components; equals the 19 entries of ICA_CHANNELS.
ICA_N_COMPONENTS = 19

# Solver limits -- presumably passed to the FastICA fit as its iteration cap
# and convergence tolerance (confirm against the call site).
ICA_MAX_ITER = 500
ICA_TOL = 0.0001

# Cut-off used with the ICA energy values; presumably the threshold of the
# energy concentration criterion in Algorithm 1 (confirm against the caller).
ICA_ENERGY_THRESHOLD = 0.35

# =============================================================================
# SVM parameters (Section 3.6, Figure 6)
# =============================================================================
# Kernel choice first, then its two hyper-parameters. Both values are powers
# of two, spelled out here with the exponent form kept in the comment.
SVM_KERNEL = "rbf"
SVM_C = 8192      # 2 ** 13
SVM_GAMMA = 2     # 2 ** 1

# =============================================================================
# Evaluation
# =============================================================================
# One seed per repetition of the experiment.
RANDOM_SEEDS = [42, 123, 456, 789, 1024]

# Paper: "results were averaged by running the model five times".
# Derived from the seed list so the two settings cannot drift apart.
N_RUNS = len(RANDOM_SEEDS)

# =============================================================================
# ICA strategy (not specified in paper — this is a reproducibility variable)
# =============================================================================
# Scope over which a single ICA is fitted. Accepted values:
#   'per_run'     - a separate fit for each ~2-minute run
#   'per_subject' - one fit per subject, on all of that subject's runs
#                   concatenated
#   'global'      - one fit on all training subjects concatenated
ICA_STRATEGY = "per_subject"

# =============================================================================
# Method 2: Cross-Subject Split (Table 4)
# =============================================================================
# The 6 problematic subjects (S088, S089, S092, S100, S104, S106) are already
# absent from the curated dataset, so the splits are plain consecutive ID
# ranges over the subjects that remain.
TRAIN_SUBJECTS = [*range(1, 84)]      # IDs 1-83   (83 subjects)
TEST_SUBJECTS = [*range(84, 94)]      # IDs 84-93  (10 subjects)
VAL_SUBJECTS = [*range(94, 104)]      # IDs 94-103 (10 subjects)