{-|
Module      : Gargantext.Core.Text.Context
Description : How to manage contexts of texts ?
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Tools for managing the contexts of a text. The main types are:

- Term
- Multi-term
- Label
- Sentence
- Corpus

How to split contexts is described in this module.

-}


module Gargantext.Core.Text.Context
  where

import Data.Text (Text, pack, unpack)
import Data.String (IsString)

import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Core.Text
import Gargantext.Prelude hiding (length)

------------------------------------------------------------------------
-- | A single term, stored as 'Text'.
type Term = Text
-- | A multi-term: a list of 'Term's.
type MultiTerm = [Term]
-- | A label; it has the same representation as a 'MultiTerm'.
type Label = MultiTerm

-- | A list of pairs, each made of a 'Label' and a list of 'MultiTerm's.
-- NOTE(review): presumably the label names the group of multi-terms it is
-- paired with -- confirm against callers.
type TermList = [(Label, [MultiTerm])]

-- | A sentence (or a nominal group): a list of elements of type @a@.
type Sentence  a = [a] -- or a nominal group
-- | A corpus: a list of 'Sentence's over the same element type @a@.
type Corpus    a = [Sentence a] -- a list of sentences

-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------

-- | Granularity at which 'splitBy' cuts a text into contexts.
--
-- For 'Chars' and 'Sentences', 'splitBy' turns the 'Int' @n@ into windows of
-- @n + 1@ units (see the examples on 'splitBy').
data SplitContext
  = Chars Int
    -- ^ Windows of @n + 1@ characters.
  | Sentences Int
    -- ^ Windows of @n + 1@ sentences.
  | Paragraphs Int
    -- ^ Text nodes of the parsed markup; 'splitBy' ignores the 'Int'.

-- | Cut a text into contexts of the requested granularity.
--
-- * @'Chars' n@: the characters of the text are grouped into chunks of
--   @n + 1@ characters (see the examples below).
-- * @'Sentences' n@: same chunking, applied to the result of 'sentences';
--   each chunk is joined back into one 'Text' with 'unsentences'.
-- * @'Paragraphs' _@: the text is parsed with 'parseTags' and the content of
--   every text tag is returned; the 'Int' is not used.
--
-- For examples at a higher level (sentences and paragraphs), see
-- 'Gargantext.Core.Text.Examples.ex_terms'.
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context txt = case context of
  Chars n      -> map pack        (chunkAlong (n + 1) 1 (unpack txt))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences txt))
  Paragraphs _ -> map textOf      (filter isTagText (parseTags txt))
  where
    -- Content of a text tag.  The list has already been filtered with
    -- 'isTagText', so the fallback case only keeps the match total.
    textOf :: IsString p => Tag p -> p
    textOf tag = case tag of
      TagText content -> content
      _               -> ""