/* Ingests raw genome data files from 23andMe, Strategene (maybe all Diagnomics files), AncestryDNA, SelfDecode, PLINK, TellMeGen-style quoted files and MyHappyGenes. */

import {TRACE_INGEST} from "../Constants";

// Version tag of this ingest logic; it is embedded in the error and warning messages as support info.
const INGEST_VERSION_NO = "v0.1"

// Limit on how many leading lines of a file are scanned when looking for the column-headers line.
const MAX_LINES_TO_SEARCH_FOR_HEADERS = 30

/**
 * Removes a leading "GSA-" marker from an rsId token such as "GSA-rs123".
 *
 * The match is case-insensitive ("gsa-rs123" and "GSA-RS123" are handled too) and only
 * applies when the marker is directly followed by "rs"; any other token is returned
 * unchanged. The casing of the remaining "rs..." part is preserved.
 *
 * @param {string} token - Raw rsId column value.
 * @returns {string} The token without the "GSA-" prefix.
 */
const stripGsaPrefixSelfDecode = (token) => token.replace(/^GSA-(?=rs)/i, "");

/**
 * Removes every double-quote character from a token of a quoted (TellMeGen-style) row.
 *
 * A regex with the global flag is used on purpose: String.prototype.replace with a plain
 * string pattern would only remove the first quote.
 *
 * @param {string} token - Raw column value, possibly carrying quote characters.
 * @returns {string} The token with all '"' characters removed.
 */
const stripQuoteTMG = (token) => token.replace(/"/g, "");

// Column separator used by every plain tab-separated format.
const TAB_COLUMN_SEPARATOR = "\t";

// Column-headers line shared by the SelfDecode and PLINK exports.
const RSID_GENOTYPE_HEADERS_PATTERN = /rsid\tchromosome\tposition\tgenotype/;

// Row ingesters. Each one receives the tokens of a single data row and returns
// [rsId, genome pair, errors].

// Genotype is a single column (4th); rsId is taken as-is.
const ingestGenotypeColumnRow = (tokens) => [tokens[0], tokens[3], undefined];

// The two alleles live in separate columns (4th and 5th) and are concatenated.
const ingestAllelePairColumnsRow = (tokens) => [tokens[0], tokens[3] + tokens[4], undefined];

// Genotype is a single column (4th). NOTE: some rsIds are prefixed with "gsa-"
const ingestGsaPrefixedRow = (tokens) => [stripGsaPrefixSelfDecode(tokens[0]), tokens[3], undefined];

// Like ingestGsaPrefixedRow, but the whole line was wrapped in quotes, so the first and
// last tokens carry a quote character that has to be removed.
const ingestQuotedGsaPrefixedRow = (tokens) => [
    stripGsaPrefixSelfDecode(stripQuoteTMG(tokens[0])),
    stripQuoteTMG(tokens[3]),
    undefined
];

// One entry per supported file format:
//   name                    - label reported in support/diagnostic messages
//   first_line_pattern      - must match the first line of the file
//   second_line_pattern     - (optional) must also match the second line
//   headers_line_pattern    - identifies the column-headers line
//   column_separator        - string or regex the data rows are split on
//   expected_tokens_per_row - rows with a different token count are not ingested
//   row_ingester            - maps the row tokens to [rsId, genome pair, errors]
// TODO DPB merge strategies that differ only in first_line_pattern and make it a disjunction
const GENOME_FILE_STRATEGIES = [
    {
        name: "23andMe",
        first_line_pattern: /^# This data file generated by 23andMe/,
        headers_line_pattern: /^# rsid\t/,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 4,
        row_ingester: ingestGenotypeColumnRow
    },
    {
        name: "Strategene or Ixxx (Diagnomics)",
        first_line_pattern: /^#This file was generated by Diagnomics/,
        headers_line_pattern: /^rsID\t/,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 4,
        row_ingester: ingestGenotypeColumnRow
    },
    {
        name: "Ancestry",
        first_line_pattern: /^#AncestryDNA raw data download/,
        headers_line_pattern: /rsid\tchromosome\tposition\tallele1\tallele2/,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 5,
        row_ingester: ingestAllelePairColumnsRow
    },
    {
        name: "SelfDecode2023",
        first_line_pattern: /^# Pipeline Version: 2.00/,
        second_line_pattern: /^# Source: SelfDecode/,
        headers_line_pattern: RSID_GENOTYPE_HEADERS_PATTERN,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 4,
        row_ingester: ingestGsaPrefixedRow
    },
    {
        name: "SelfDecode",
        first_line_pattern: /^# Generated by SelfDecode/,
        headers_line_pattern: RSID_GENOTYPE_HEADERS_PATTERN,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 4,
        row_ingester: ingestGsaPrefixedRow
    },
    {
        name: "PLINK",
        first_line_pattern: /^# This data file generated by PLINK/,
        headers_line_pattern: RSID_GENOTYPE_HEADERS_PATTERN,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 4,
        row_ingester: ingestGsaPrefixedRow
    },
    {
        // TellMeGen (and maybe others) that puts quotes around the whole line
        name: "QUOTED_WSV",
        // the headers line is the first line of the file
        first_line_pattern: /^"# rsid\tchromosome\tposition\tgenotype"$/,
        headers_line_pattern: /^"# rsid\tchromosome\tposition\tgenotype"$/,
        // any run of whitespace separates the columns
        column_separator: /[\t\s]+/,
        expected_tokens_per_row: 4,
        row_ingester: ingestQuotedGsaPrefixedRow
    },
    {
        name: "MyHappyGenes",
        first_line_pattern: /^# MyHappyGenes \[TD\]$/,
        headers_line_pattern: /^SNP Name\tChr\tPosition\tAllele1 - Forward\tAllele2 - Forward$/,
        column_separator: TAB_COLUMN_SEPARATOR,
        expected_tokens_per_row: 5,
        row_ingester: ingestAllelePairColumnsRow
    }
];

/**
 * Parses the text of a raw genome data file into an rsId -> genotype dictionary.
 *
 * The file format is chosen by matching the first line (and, where a strategy asks for
 * it, the second line) against GENOME_FILE_STRATEGIES. The column-headers line is then
 * searched for within the first MAX_LINES_TO_SEARCH_FOR_HEADERS lines, and every line
 * after it is treated as a data row.
 *
 * Rows with an unexpected number of columns, repeated rsIds and genotypes that are not
 * one or two of A, C, G, T, D, I are left out of the dictionary and reported as warnings.
 * rsIds are stored lower-cased, genotypes upper-cased.
 *
 * @param {string} genomeDataFileString - Whole file contents; rows are separated by "\n"
 *        (each row is trimmed, so a trailing "\r" is tolerated).
 * @param {boolean} [traceToConsole=TRACE_INGEST] - When truthy, logs a short summary of the parsed data.
 * @returns {Array} `[genomeDict, errors, warnings]`. When the file is not recognized,
 *          genomeDict is undefined and errors holds the explanatory messages; otherwise
 *          errors is empty and warnings lists whatever was skipped.
 */
function ingestGenomeData(genomeDataFileString, traceToConsole = TRACE_INGEST) {
    // TODO DPB maybe 635,000 rows, so reconsider cost of storing the split vs. parse-as-you-go
    const tsvRows = genomeDataFileString.split("\n");
    const tsvRowsSize = tsvRows.length;

    if (tsvRowsSize < 2) {
        return [undefined, ["Not a recognized genome data file - only zero or one lines of data ( " + INGEST_VERSION_NO + ")"], []];
    }

    // Makes a line safe to embed in a message: cut to 100 characters, then backslash, tab
    // and newline become \\, \t and \n, and remaining control / high characters become
    // \x00NN where NN is the two-digit hex code of the character.
    const truncateAndEscapeSpecialCharacters = (aString) => {
        const truncated = aString.length > 100 ? aString.substring(0, 100) + "..." : aString;
        return truncated
            .replace(/\\/g, "\\\\")
            .replace(/\t/g, "\\t")
            .replace(/\n/g, "\\n")
            // zero-pad so that e.g. \x01 is rendered as \x0001 rather than \x001
            .replace(/[\x00-\x1F\x7F-\xFF]/g, (c) => "\\x00" + ("0" + c.charCodeAt(0).toString(16)).slice(-2));
    };

    // Appends (and logs) the second and third lines of the file as extra diagnostic messages.
    const appendSecondAndThirdLines = (messages) => {
        if (tsvRowsSize > 1) {
            const secondLineMessage = "(Second Line '" + truncateAndEscapeSpecialCharacters(tsvRows[1]) + "')";
            console.log(secondLineMessage);
            messages.push(secondLineMessage);
        }
        if (tsvRowsSize > 2) {
            const thirdLineMessage = "(Third Line '" + truncateAndEscapeSpecialCharacters(tsvRows[2]) + "')";
            console.log(thirdLineMessage);
            messages.push(thirdLineMessage);
        }
    };

    const firstLine = tsvRows[0].trim();

    const strategy = GENOME_FILE_STRATEGIES.find((tryMe) =>
        tryMe.first_line_pattern.test(firstLine) &&
        (!tryMe["second_line_pattern"] || tryMe.second_line_pattern.test(tsvRows[1].trim()))
    );

    if (!strategy) {
        const unrecognized = "Not a recognized genome data file - failed to identify first line in file. (Line '" + truncateAndEscapeSpecialCharacters(firstLine) + "'; " + INGEST_VERSION_NO + ")";
        console.log(unrecognized);
        const failureMessages = [unrecognized];
        appendSecondAndThirdLines(failureMessages);
        return [undefined, failureMessages, []];
    }

    // Look for the column-headers line. Every candidate within the limit is tested,
    // including the last line of the file and the line at the limit itself.
    const commentLines = []; // lines preceding the headers line; only used for diagnostics
    const headerSearchLimit = Math.min(MAX_LINES_TO_SEARCH_FOR_HEADERS, tsvRowsSize);
    let headerLineIndex = -1;
    for (let lineIndex = 0; lineIndex < headerSearchLimit; lineIndex++) {
        const headerLineCandidate = tsvRows[lineIndex].trim();
        if (strategy.headers_line_pattern.test(headerLineCandidate)) {
            headerLineIndex = lineIndex;
            break;
        }
        commentLines.push(headerLineCandidate);
    }

    if (headerLineIndex < 0) {
        const failureMessages = [
            "Not a recognized genome data file - failed to find column headers line within the first " + MAX_LINES_TO_SEARCH_FOR_HEADERS + " lines.",
            "Support info... INGEST_VERSION_NO=" + INGEST_VERSION_NO + "; Strategy: " + strategy.name,
            "(First line: '" + truncateAndEscapeSpecialCharacters(firstLine) + "')"
        ];
        console.log(failureMessages[0]);
        appendSecondAndThirdLines(failureMessages);
        console.log(commentLines);
        return [undefined, failureMessages, []];
    }
    // TODO DPB if file started with a bunch of lines starting with hash mark, then take the first line with rsNNNN as data

    const VALID_GENOTYPE = /^[ACGTDI]{1,2}$/;
    const genomeDict = {};
    const warnings = [];
    let wrongNumberOfTokensRowsCount = 0;
    let firstWrongNumberOfTokensRow = -1;
    let duplicateRsIdsCount = 0;
    const duplicateRsIds = [];
    const ignoredGenomes = {}; // genotype string -> number of rows that carried it

    // TODO DPB handle empty or starts-with-hash rows in the middle of the data
    for (let lineIndex = headerLineIndex + 1; lineIndex < tsvRowsSize; lineIndex++) {
        const tokens = tsvRows[lineIndex].trim().split(strategy.column_separator);

        if (tokens.length !== strategy.expected_tokens_per_row) {
            // A single empty line at the very end of the file is normal (trailing newline).
            const isTrailingBlankLine = lineIndex === tsvRowsSize - 1 && tokens.length === 1 && tokens[0] === "";
            if (!isTrailingBlankLine) {
                console.log("Line with unusual tokens count...");
                console.log(tokens);
                if (wrongNumberOfTokensRowsCount === 0) {
                    firstWrongNumberOfTokensRow = lineIndex;
                }
                wrongNumberOfTokensRowsCount++;
            }
            continue;
        }

        const [rsIdRaw, genotype] = strategy.row_ingester(tokens);
        const rsId = rsIdRaw.trim().toLowerCase(); // TODO DPB lower case might not be correct action for non rsNNN rows

        // Own-property check so that ids such as "constructor" are not mistaken for duplicates.
        if (Object.prototype.hasOwnProperty.call(genomeDict, rsId)) {
            duplicateRsIdsCount++;
            duplicateRsIds.push(rsId);
            continue;
        }

        const rsIdGenotype = genotype.trim().toUpperCase();
        if (VALID_GENOTYPE.test(rsIdGenotype)) {
            genomeDict[rsId] = rsIdGenotype;
        } else {
            // anything other than one or two of A, C, G, T, D, I is only counted
            ignoredGenomes[rsIdGenotype] = (ignoredGenomes[rsIdGenotype] || 0) + 1;
        }
    }

    if (wrongNumberOfTokensRowsCount > 0) {
        const warn = "Data file had " + wrongNumberOfTokensRowsCount.toString() + " wrong-number-of-tabs rows.  Ignoring that row and proceeding anyway (first such line " + firstWrongNumberOfTokensRow + "; " + INGEST_VERSION_NO + ".)";
        warnings.push(warn);
    }
    if (duplicateRsIdsCount > 0) {
        const warn = "Data file had " + duplicateRsIdsCount.toString() + " duplicate-rsId rows. Ignoring those rows and proceeding anyway " + INGEST_VERSION_NO;
        console.log(warn);
        console.log(duplicateRsIds);
        warnings.push(warn);
    }
    // An object is always truthy, so the warning must depend on whether anything was recorded.
    const ignoredGenotypes = Object.keys(ignoredGenomes);
    if (ignoredGenotypes.length > 0) {
        const warn = "Data file has SNP encodings that are not pairs of A, C, G, T, I, D.  An example is " + ignoredGenotypes[0];
        console.log(warn);
        console.log(ignoredGenomes);
        warnings.push(warn);
    }
    if (traceToConsole) {
        const keys = Object.keys(genomeDict);
        console.log("Count of genes in data: " + keys.length);
        console.log(keys[0]);
        console.log(genomeDict[keys[0]]);
        const oneRsId = "rs7775228";
        console.log(oneRsId);
        console.log(genomeDict[oneRsId]);
    }
    if (warnings.length > 0) {
        warnings.push("Support info. ; INGEST_VERSION_NO=" + INGEST_VERSION_NO + "; strategy: " + strategy.name + "; first line: " + firstLine);
    }
    return [genomeDict, [], warnings];
}

export default ingestGenomeData;