src/Gargantext/Core/Text/Context.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Context
   3 Description : How to manage contexts of texts ?
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Context of text management tool; here is the logic of the main types:
  11
  12 - Term
  13 - Multi-term
  14 - Label
  15 - Sentence
  16 - Corpus
  17
  18 How to split contexts is described in this module.
  19
  20 -}
  21
  22
  23 module Gargantext.Core.Text.Context
  24   where
  25
  26 import Data.Text (Text, pack, unpack)
  27 import Data.String (IsString)
  28
  29 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
  30 import Gargantext.Core.Text
  31 import Gargantext.Prelude hiding (length)
  32
  33 ------------------------------------------------------------------------
  34 type Term = Text
  35 type MultiTerm = [Term]
  36 type Label = MultiTerm
  37
  38 type TermList = [(Label, [MultiTerm])]
  39
  40 type Sentence  a = [a] -- or a nominal group
  41 type Corpus    a = [Sentence a] -- a list of sentences
  42
  43 -- type ConText a = [Sentence a]
  44 -- type Corpus a = [ConText a]
  45 ------------------------------------------------------------------------
  46
  47 -- | Contexts definition to build/unbuild contexts.
  48 data SplitContext = Chars Int | Sentences Int | Paragraphs Int
  49
  50 -- | splitBy contexts of Chars or Sentences or Paragraphs
  51 -- To see some examples at a higher level (sentences and paragraph), see
  52 -- 'Gargantext.Core.Text.Examples.ex_terms'
  53 --
  54 -- >>> splitBy (Chars 0) (pack "abcde")
  55 -- ["a","b","c","d","e"]
  56 --
  57 -- >>> splitBy (Chars 1) (pack "abcde")
  58 -- ["ab","bc","cd","de"]
  59 --
  60 -- >>> splitBy (Chars 2) (pack "abcde")
  61 -- ["abc","bcd","cde"]
  62 splitBy :: SplitContext -> Text -> [Text]
  63 splitBy (Chars     n)  = map pack        . chunkAlong (n+1) 1 . unpack
  64 splitBy (Sentences n)  = map unsentences . chunkAlong (n+1) 1 . sentences
  65 splitBy (Paragraphs _) = map unTag       . filter isTagText   . parseTags
  66   where
  67     unTag :: IsString p => Tag p -> p
  68     unTag (TagText x) = x
  69     unTag _           = ""
  70