{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use tuple-section" #-}
module Lib
    ( parseCorpus,
    computeNGramFrequenciesPar,
    computeNGramFrequencies,
    docDelim,
    elimDots,
    tokeniseDoc,
    isSafeChar,
    prefixNodePar,
    insertTrie,
    createNGramMapPar,
    createNGramMap,
    getPrediction,
    computeNGrams,
    computeNGramsPar,
    prefixSearchSeq,
    buildTree,
    makeForest,
    listDir,
    drive,
    Trie (..)
    ) where

import qualified Data.Text as T
import qualified Data.ByteString.Char8 as B
import Data.Text.Encoding as E (decodeUtf8)
import GHC.Unicode as U ( isSpace, isAlpha, toLower )
import Control.Parallel.Strategies as S
import qualified Data.Map.Strict as M
import Data.List.Split as Split
import Data.List (sortOn)
import Control.DeepSeq as DS
import System.Directory
    ( getCurrentDirectory, getDirectoryContents, setCurrentDirectory )
import System.Directory.Internal.Prelude (getArgs)
import System.Exit(die)
import System.Environment (getProgName)
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TIO
import Data.Time

-- Data Constructors --

data Trie = Node Bool Int (M.Map Char Trie) | Empty deriving (Eq, Read)

instance Show Trie where
        show (Node bool count tmap) = "("++show count++") " ++ show bool ++ ": " ++ show tmap
        show _ = error "empty Trie"

instance NFData Trie where
        rnf Empty = ()
        rnf (Node _ _ mp) = DS.rnf mp `seq` ()

-- Types --

type Token = T.Text
type NGram = [Token]
type Line = [Token]
type Document = [Line]
type Corpus = [Document]

-- Constants --

docDelim :: T.Text
docDelim = T.pack "@@"

lineDelim :: T.Text
lineDelim = T.pack " . "

space :: T.Text
space = T.pack ""

threeDots :: T.Text
threeDots = T.pack ". . ."

fourDots :: T.Text
fourDots = T.pack ". . . ."

startDelim :: T.Text
startDelim = T.pack "<s/>"

smooth :: Int
smooth = 13

gramWeight:: Double
gramWeight = 5


-- Utility Functions --

splitInto :: Int -> [e] -> [[e]]
splitInto n xs = Split.chunksOf ((ceiling :: Double -> Int) (fromIntegral ( length xs `div` n)))  xs

delLast :: [a] -> [a]
delLast []     = error "Empty list!"
delLast [_]    = []
delLast (h:t)  = h : delLast t

listDir:: String -> IO[String]
listDir path = do
    setCurrentDirectory path
    cd <- getCurrentDirectory
    putStrLn ("Entering directory at: " ++ show cd)
    getDirectoryContents cd

lastN :: Int -> [a] -> [a]
lastN n xs = foldl (const . drop 1) xs (drop n xs)

addScores :: (Token, Double) -> (Token, Double) -> (Token, Double)
addScores (a, s1) (_, s2) = (a, (gramWeight * s1) + s2)

-- Parsing, Cleaning, and Tokenizing functions --

isSafeChar :: Char -> Bool
isSafeChar c = isAlpha c || isSpace c || c == '@' || c == '-' || c == '.'

elimDots :: T.Text -> T.Text
elimDots t = T.replace fourDots space (T.replace threeDots space t)

tokeniseDoc :: T.Text -> Document
tokeniseDoc t = map T.words (T.splitOn lineDelim t)

parseCorpus:: B.ByteString -> Corpus
parseCorpus file = do
        let cleaned_corpus = decodeUtf8 (B.map toLower file)
        let docs = T.splitOn docDelim cleaned_corpus
        let text_docs = map elimDots docs
        S.parMap S.rdeepseq tokeniseDoc text_docs

-- Functions to generate Ngrams --

computeNGramsPar :: Int -> Corpus -> [NGram]
computeNGramsPar n corpus = let preppedDocs = map (prepareforNGram n) corpus in  concat (concatMap (S.parMap S.rdeepseq (ngram n)) preppedDocs)

computeNGrams :: Int -> Corpus -> [NGram]
computeNGrams n corpus = let preppedDocs = map (prepareforNGram n) corpus in  concat (concatMap (map (ngram n)) preppedDocs)

prepareforNGram:: Int -> Document -> Document
prepareforNGram n = map (replicate n startDelim ++ )

ngram :: Int -> Line -> [NGram]
ngram n xs
  | n <= length xs = take n xs : ngram n (drop 1 xs)
  | otherwise = [xs]

-- Functions to generate maps of Ngram frequencies --

makeMaps :: Int -> [Corpus] -> [M.Map NGram Int]
makeMaps n  = S.parMap S.rdeepseq (createNGramMapPar n)

searchMaps :: NGram -> [M.Map NGram Int] -> Int
searchMaps gram maps = foldl (+) smooth (S.parMap S.rdeepseq (M.findWithDefault 0 gram) maps)

computeNGramFrequencies :: Int -> Corpus -> M.Map NGram Int
computeNGramFrequencies n corp = let tupCounts = map (\x -> (x, 1::Int)) (computeNGrams n corp)
                                     histogram = M.toList $ M.fromListWith (+) tupCounts
                                     in M.fromList histogram

computeNGramFrequenciesPar :: Int -> Int -> Corpus -> M.Map NGram Int
computeNGramFrequenciesPar nochunks n corp = let chunks = splitInto nochunks corp in foldl (M.unionWith (+)) M.empty (S.parMap S.rdeepseq (computeNGramFrequencies n) chunks)

createNGramMapPar :: Int -> Corpus -> M.Map NGram Int
createNGramMapPar n corp = let mps = S.parMap S.rdeepseq (flip (computeNGramFrequenciesPar 8) corp) [1..n] in foldl (M.unionWith (+)) M.empty mps

createNGramMap :: Int -> Corpus -> M.Map NGram Int
createNGramMap n corp = let mps = map (`computeNGramFrequencies` corp) [1..n] in foldl (M.unionWith (+)) M.empty mps

-- Functions for Tries --

makeForest :: [String] -> IO [Trie]
makeForest filteredDirs = do
                        let docs = map TIO.readFile filteredDirs
                        let tok = TL.splitOn (TL.pack " ")
                        let fn =  (return::(a -> IO a)) (tok.TL.toLower) -- lowercase
                        let ws = map (fn <*>) docs
                        sequence (S.parMap S.rpar (return buildTreeT <*>) ws)

generateMatches ::  String -> [Trie] ->  [(String,Int)]
generateMatches sent tries = concat $ S.parMap S.rdeepseq (prefixNodePar 0 sent "") tries

insertTrieT :: TL.Text -> Trie -> Trie
insertTrieT _ Empty = error "insert into empty Trie"
insertTrieT txt (Node bool count tmap)
        | txt == TL.empty = Node True (count + 1) tmap
        | M.member x tmap = let map' = M.insert x (insertTrieT xs (tmap M.! x)) tmap in (Node bool count map')
        | otherwise = let map' = M.insert x (insertTrieT xs (Node False 0 M.empty)) tmap in (Node bool count map')
        where
                x = TL.head txt
                xs = TL.tail txt

insertTrie :: String -> Trie -> Trie
insertTrie _ Empty = error "insert into empty Trie"
insertTrie (x:xs) (Node bool count tmap)
        | M.member x tmap = let map' = M.insert x (insertTrie xs (tmap M.! x)) tmap in (Node bool count map')
        | otherwise = let map' = M.insert x (insertTrie xs (Node False 0 M.empty)) tmap in (Node bool count map')
insertTrie [] (Node _ count tmap) = Node True (count + 1) tmap

prefixNodePar :: Int -> String -> String -> Trie -> [(String, Int)]
prefixNodePar depth [] scat (Node isEnd count children) = prefixSearchPar depth (scat, (Node isEnd count children))
prefixNodePar depth (x:xs) scat (Node _ _ children)
        | M.member x children = prefixNodePar depth xs (x:scat) (children M.! x)
        | otherwise = []
prefixNodePar _ _ _ _ = error "args"


prefixSearchSeq::(String, Trie) -> [(String,Int)]
prefixSearchSeq (scat, (Node isEnd count children)) = let childList = zipWith (\a b -> ((fst a):b, snd a) ) (M.toList children) (replicate (length children) scat)
                                                          curr = if isEnd then scat else ""
                                                          in (curr, count):concatMap prefixSearchSeq childList
prefixSearchSeq _ = error "args"

prefixSearchPar:: Int -> (String, Trie) -> [(String,Int)]
prefixSearchPar depth (scat, Node isEnd count children)
        | depth > 0 = let childList = zipWith (\a b -> (fst a:b, snd a) ) (M.toList children) (replicate (length children) scat)
                          curr = if isEnd then scat else ""
                          in (curr, count): (concat $ S.parMap S.rseq (prefixSearchPar (depth - 1)) childList)
        | otherwise = prefixSearchSeq (scat, (Node isEnd count children))
prefixSearchPar _ _ = error "empty"

buildTree :: [String] -> Trie
buildTree = foldl (flip insertTrie) (Node True 0 M.empty)

buildTreeT :: [TL.Text] -> Trie
buildTreeT = foldl (flip insertTrieT) (Node True 0 M.empty)


-- Functions to generate predictions --


getPrediction :: String -> [M.Map NGram Int] -> [Trie] -> String
getPrediction [] _ _ = "Error: invalid input"
getPrediction sent mps tries = fst $ last' (sortOn snd scores) where
        sentence = map T.pack (words sent)
        curNgram = delLast sentence
        lst = last sentence
        guesses = map (\x -> (T.pack $ reverse $ fst x, snd x)) (generateMatches (T.unpack lst) tries)
        scores = S.parMap S.rdeepseq (\x -> (T.unpack $ fst x, snd x)) (map (getScore curNgram mps) guesses)

last' :: [(String, Double)] -> (String, Double)
last' [] = ("Error: Unable to complete word", 0.0)
last' [x] = x
last' (_:xs) = last' xs


getScore :: NGram -> [M.Map NGram Int] -> (Token, Int) -> (Token, Double)
getScore curNgram mps token
        | length curNgram > 2 = let twoGramScore = score (lastN 2 curNgram) mps token
                                    nGramScore =  score curNgram mps token
                                    in addScores nGramScore twoGramScore
        | otherwise = score curNgram mps token

score :: NGram -> [M.Map NGram Int] -> (Token, Int) -> (Token, Double)
score curNgram mps (guess, _) = let freq = searchMaps (curNgram ++[guess]) mps in (guess, fromIntegral freq / fromIntegral (searchMaps curNgram mps))

-- Driver Functions --


makeCorpi :: [String] -> IO [Corpus]
makeCorpi filteredDirs = do
                        let docs = map B.readFile filteredDirs
                        let fn = (return::(a -> IO a)) parseCorpus
                        -- let fn =  (return::(a -> IO a)) (words.(map toLower.filter isSafeChar))
                        let corpi = S.parMap S.rpar (fn <*>) docs
                        sequence corpi

drive :: Int -> IO ()
drive n  = do
        args <- getArgs
        dirName <- case args of
                [fn] -> return fn
                _ -> do pn <- getProgName
                        die $ "Usage: "++pn++" <corpus-directory>"

        dirs <- listDir dirName -- directory of files
        let filteredDirs = filter (\x -> x /= "." && x /= "..") dirs
        start <- getCurrentTime
        tries <- makeForest filteredDirs
        corpi <- makeCorpi filteredDirs

        let maps = makeMaps n corpi
        let prediction = getPrediction "he was y" maps tries -- dummy prediction to force language model to be built fully
        end <- prediction `deepseq` getCurrentTime

        putStrLn ("language model built successfully in " ++ show (diffUTCTime end start))

        _ <- printPredictions maps tries
        return ()

printPredictions :: [M.Map NGram Int] -> [Trie] -> IO b
printPredictions maps tries = do
        putStrLn "enter a prefix: "
        prefix <- getLine
        -- putStrLn ("prediction for: " ++ prefix)
        let prediction = getPrediction prefix maps tries
        putStrLn prediction
        printPredictions maps tries
