module WordEmbSeq
   (
      cooccurence,
      cooccurenceMatrix
   )
   where

import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Map as M
import Data.Maybe(fromMaybe)

-- | Build the sparse co-occurrence matrix of a whole corpus.
--
-- Each sentence of @corpus@ (a list of tokens) is converted to a vector and
-- counted on its own with 'cooccurence'; the per-sentence matrices are then
-- merged, adding the counts of identical @(centerId, contextId)@ keys.
cooccurenceMatrix :: [[T.Text]] -> M.Map T.Text Int -> Int -> M.Map (Int, Int) Int
cooccurenceMatrix corpus vocab window = M.unionsWith (+) perSentence
  where
    -- One sparse matrix per sentence of the corpus.
    perSentence = [cooccurence (V.fromList tokens) vocab window | tokens <- corpus]

-- | Sparse co-occurrence matrix of a single sentence.
--
-- Every position of @sentence@ is taken in turn as the center word, its
-- counts are computed by '_cooccurence', and the per-position maps are merged
-- by summing the counts of equal @(centerId, contextId)@ keys. An empty
-- sentence yields an empty map.
cooccurence :: V.Vector T.Text -> M.Map T.Text Int -> Int -> M.Map (Int, Int) Int
cooccurence sentence vocab window = M.unionsWith (+) (map countsAt positions)
  where
    -- All valid center positions (empty when the sentence is empty).
    positions = [0 .. V.length sentence - 1]
    countsAt  = _cooccurence sentence vocab window

-- | Count the co-occurrences of the word at position @i@ with the words of
-- its context window.
--
-- The result maps @(centerId, contextId)@ to the number of times that
-- context word appears in the left or right context returned by '_context'
-- for position @i@; ids are the values stored in @vocab@.
--
-- Words missing from @vocab@ are skipped instead of raising an error: an
-- out-of-vocabulary center word (or an index @i@ outside the sentence) gives
-- an empty map, and an out-of-vocabulary context word contributes no entry.
_cooccurence :: V.Vector T.Text -> M.Map T.Text Int -> Int -> Int -> M.Map (Int, Int) Int
_cooccurence sentence vocab window i =
  -- The center id is resolved once, not once per context word.
  case sentence V.!? i >>= wordId of
    Nothing       -> M.empty
    Just centerId ->
      M.fromListWith (+)
        [ ((centerId, contextId), 1)
        | word <- contextWords
        , Just contextId <- [wordId word]  -- drops out-of-vocabulary words
        ]
  where
    wordId word  = M.lookup word vocab
    -- Only evaluated once the center word is known to exist and be in vocab.
    contextWords =
      let (leftContext, rightContext) = _context sentence window i
      in  V.toList (leftContext V.++ rightContext)

-- | Split out the context window around position @i@ of @sentence@.
--
-- Returns @(left, right)@: the (at most) @window@ words immediately before
-- position @i@ and the (at most) @window@ words immediately after it, both in
-- sentence order and truncated at the sentence boundaries. The word at @i@
-- itself is in neither half.
--
-- The function is total: a negative @window@ is treated as 0, and an index
-- outside the sentence yields two empty vectors.
_context :: V.Vector T.Text -> Int -> Int -> (V.Vector T.Text, V.Vector T.Text)
_context sentence window i
  | i < 0 || i >= V.length sentence = (V.empty, V.empty)
  | otherwise                       = (leftContext, rightContext)
  where
    reach        = max 0 window
    -- With 0 <= i and 0 <= reach the subtraction cannot overflow.
    leftStart    = max 0 (i - reach)
    leftContext  = V.slice leftStart (i - leftStart) sentence
    -- take/drop clamp to the vector bounds, so no @i + window@ sum is needed
    -- (that sum overflows for very large windows such as 'maxBound').
    rightContext = V.take reach (V.drop (i + 1) sentence)

-- | Re-key an id-indexed co-occurrence matrix by the words themselves.
--
-- @vocab@ (word -> id) is inverted into an id -> word table and both
-- components of every @(Int, Int)@ key of @cm@ are translated through it;
-- counts are carried over unchanged. Looking up an id that has no word in
-- @vocab@ is an error ('M.!' is partial).
toWordsMatrix :: M.Map (Int, Int) Int -> M.Map T.Text Int -> M.Map (T.Text, T.Text) Int
toWordsMatrix cm vocab = M.mapKeys renamePair cm
  where
    -- Inverse of the vocabulary: id -> word.
    idToWord = M.fromList [(ident, word) | (word, ident) <- M.toList vocab]
    wordOf ident = idToWord M.! ident
    renamePair (rowId, colId) = (wordOf rowId, wordOf colId)
