module Utils
   (
      lineTokenize,
      vocabulary
   )
   where

import qualified Data.Text as T
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text.IO as TIO
import Data.Char ( isSpace, isAlpha, toLower )

-- | Decide whether a token counts as a word.
--
-- A token is valid when it is non-empty and every character satisfies
-- 'isAlpha'. The explicit emptiness check matters because 'T.all' is
-- vacuously 'True' on the empty text, which would otherwise let @""@
-- through as a "word".
validToken :: T.Text -> Bool
validToken token = not (T.null token) && T.all isAlpha token

-- | Lower-case every token and keep only those that 'validToken' accepts.
--
-- The validity check runs on the lower-cased form, and the lower-cased
-- form is what ends up in the result. Input order is preserved.
cleanTokens :: [T.Text] -> [T.Text]
cleanTokens tokens =
   [ lowered
   | token <- tokens
   , let lowered = T.toLower token
   , validToken lowered
   ]

-- | Surround a token sequence with the sentence boundary markers:
-- @\<START\>@ is placed in front and @\<END\>@ is appended at the back.
wrapStartEnd :: [T.Text] -> [T.Text]
wrapStartEnd tokens = startMarker : tokens ++ [endMarker]
   where
      startMarker = T.pack "<START>"
      endMarker = T.pack "<END>"

-- | Predicate used to discard empty sequences: a sequence is valid
-- exactly when it contains at least one token.
validSequence :: [T.Text] -> Bool
validSequence [] = False
validSequence _ = True

-- | Separate text into a list of token sequences, one per input line.
--
-- The pipeline reads bottom-up: split the text into lines, split each
-- line into whitespace-delimited words and pass them through
-- 'cleanTokens', discard lines for which 'validSequence' fails, and
-- finally bracket every surviving line with 'wrapStartEnd'.
lineTokenize :: T.Text -> [[T.Text]]
lineTokenize =
   map wrapStartEnd
      . filter validSequence
      . map (cleanTokens . T.words)
      . T.lines

-- | Build a vocabulary map from every word in the corpus to an index.
--
-- Words are de-duplicated and sorted through a 'S.Set', then numbered
-- consecutively from 0 in ascending order. Because 'S.toAscList' yields
-- strictly increasing, distinct keys, the map can be built directly with
-- 'M.fromDistinctAscList'.
vocabulary :: [[T.Text]] -> M.Map T.Text Int
vocabulary corpus = M.fromDistinctAscList (zip distinctWords [0 ..])
   where
      distinctWords = S.toAscList (S.fromList (concat corpus))
