module BuildKnowledgeBase
  ( buildKnowledgeBase,
    processSentence,
  )
where

import Data.Char (isLetter, isNumber, toLower)
import Data.List (tails)
import Data.List.Split (splitOneOf, chunksOf)
import qualified Data.Map.Lazy as M
import Control.Parallel.Strategies (
  parMap, rdeepseq, rpar, parList, rseq, using, parListChunk, parBuffer)
import Control.DeepSeq (deepseq, force)

-- | Report whether a token consists solely of numeric characters.
--
-- Returns 'True' only for non-empty strings in which every character
-- satisfies 'isNumber'. The empty string is rejected on purpose so that a
-- blank token is never classified as a number.
--
-- The previous definition folded with '&&' over a 'False' seed, which made
-- the result 'False' for every input.
isStringNumber :: String -> Bool
isStringNumber s = not (null s) && all isNumber s

-- | Count how many times each distinct element occurs in a list.
--
-- Every element is paired with an initial count of 1 and the pairs are
-- combined in a map that adds the counts of equal keys. The result holds one
-- @(element, count)@ pair per distinct element, as produced by 'M.toList'.
getFreqs :: (Ord a) => [a] -> [(a, Int)]
getFreqs xs = M.toList counts
  where
    counts = M.fromListWith (+) (zip xs (repeat 1))

-- | Turn a raw sentence into a list of normalised tokens.
--
-- The sentence is split on whitespace and punctuation delimiters. Each piece
-- is then stripped of every character that is neither a letter nor a number
-- and lower-cased. Pieces that 'isStringNumber' classifies as numeric are
-- replaced by the placeholder @"[NUM]"@, and pieces that end up empty are
-- discarded.
processSentence :: String -> [String]
processSentence =
  filter (not . null) . map (replaceNumber . normalise) . splitOneOf delimiters
  where
    -- characters that separate words
    delimiters = " .,;:!?()[]{}<>-/\"\\\n"
    -- keep only letters and numbers, then lowercase what is left
    normalise = map toLower . filter isWordChar
    isWordChar c = isLetter c || isNumber c
    -- collapse every numeric token into one shared placeholder
    replaceNumber tok
      | isStringNumber tok = "[NUM]"
      | otherwise = tok

-- | List every contiguous run of exactly @n@ tokens, in order of appearance.
--
-- A window of up to @n@ tokens is taken at each suffix of the input; windows
-- shorter than @n@ (those near the end of the list) are dropped. An input
-- with fewer than @n@ tokens therefore yields no n-grams.
getNGrams :: Int -> [String] -> [[String]]
getNGrams n toks =
  [ gram
  | suffix <- tails toks
  , let gram = take n suffix
  , length gram == n
  ]

-- | Count the n-grams of size @n@ across a collection of tokenised sentences.
--
-- N-grams are extracted from each sentence separately with 'getNGrams', so
-- no n-gram spans two sentences. The pooled n-grams are then tallied with
-- 'getFreqs'.
getNGramFreqs :: Int -> [[String]] -> [([String], Int)]
getNGramFreqs n sentences =
  getFreqs [gram | sentence <- sentences, gram <- getNGrams n sentence]

-- | Write n-gram frequencies to @filename@, one entry per line.
--
-- Each line is the count rendered with 'show', a single space, and then the
-- n-gram's tokens joined by spaces. Entries are written in the order given.
writeNGramFreqs :: [([String], Int)] -> String -> IO ()
writeNGramFreqs freqs filename =
  writeFile filename (unlines [show count ++ " " ++ unwords gram | (gram, count) <- freqs])

-- | Build unigram, bigram and trigram frequency tables from a text file.
--
-- The contents of @inputFile@ are split into sentences on @.@, @?@ and @!@,
-- and each sentence is tokenised with 'processSentence'. The sentences are
-- grouped into chunks of 200, n-gram frequencies are computed for the chunks
-- in parallel, and the per-chunk counts are summed into one table per n-gram
-- size. The tables for n = 1, 2, 3 are written to @outputDir/1.txt@,
-- @outputDir/2.txt@ and @outputDir/3.txt@ in the format produced by
-- 'writeNGramFreqs'. This function does not create @outputDir@.
buildKnowledgeBase :: String -> String -> IO ()
buildKnowledgeBase inputFile outputDir = do
  input <- readFile inputFile
  let sentences = map processSentence (splitOneOf ".?!" input)
      -- chunksOf already limits every chunk to at most 200 sentences
      chunks = chunksOf 200 sentences
      -- one frequency list per chunk, evaluated in parallel
      perChunkFreqs n = map (getNGramFreqs n) chunks `using` parList rdeepseq
      -- The same n-gram can occur in several chunks. Summing the per-chunk
      -- counts gives each n-gram exactly one entry carrying its total count,
      -- instead of one partial entry per chunk.
      mergeFreqs perChunk = M.toList (M.fromListWith (+) (concat perChunk))
      ngramFreqs n = mergeFreqs (perChunkFreqs n)
      freqs = map ngramFreqs [1 .. 3]

  mapM_ (\(n, f) -> writeNGramFreqs f (outputDir ++ "/" ++ show n ++ ".txt")) $
    zip [1 ..] freqs
