src/Gargantext/Text/Context.hs

   1 {-|
   2 Module      : Gargantext.Text.Context
   3 Description : How to manage contexts of texts ?
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Context of text management tool; here is the logic of the main types:
  11
  12 - Term
  13 - Multi-term
  14 - Label
  15 - Sentence
  16 - Corpus
  17
  18 How to split contexts is described in this module.
  19
  20 -}
  21
  22 {-# LANGUAGE NoImplicitPrelude #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Context
  26   where
  27
  28 import Data.Text (Text, pack, unpack)
  29 import Data.String (IsString)
  30
  31 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
  32 import Gargantext.Text
  33 import Gargantext.Prelude hiding (length)
  34
  35 ------------------------------------------------------------------------
  -- | A term, represented as a 'Text'.
  type Term = Text

  -- | A multi-term: a list of 'Term's.
  type MultiTerm = [Term]

  -- | A label; it has the same representation as a 'MultiTerm'.
  type Label = MultiTerm

  -- | A list pairing each 'Label' with a list of 'MultiTerm's
  -- (presumably the multi-terms grouped under that label; to be confirmed
  -- against callers).
  type TermList = [(Label, [MultiTerm])]

  -- | A sentence (or a nominal group): a list of elements of type @a@.
  type Sentence a = [a]

  -- | A corpus: a list of sentences.
  type Corpus a = [Sentence a]
  44
  45 -- type ConText a = [Sentence a]
  46 -- type Corpus a = [ConText a]
  47 ------------------------------------------------------------------------
  48
  -- | Contexts definition to build/unbuild contexts: selects the unit on
  -- which 'splitBy' cuts a text.
  data SplitContext
    = Chars Int
      -- ^ Character contexts; 'splitBy' chunks the characters with a window
      -- of @n + 1@.
    | Sentences Int
      -- ^ Sentence contexts; 'splitBy' chunks the sentences with a window
      -- of @n + 1@.
    | Paragraphs Int
      -- ^ Paragraph contexts; 'splitBy' currently ignores the 'Int'.
  51
  -- | Split a text into contexts, according to the given 'SplitContext'.
  --
  -- * @'Chars' n@: the text is unpacked, chunked with @chunkAlong (n + 1) 1@
  --   and every chunk is packed back into a 'Text'.
  -- * @'Sentences' n@: the output of 'sentences' is chunked with
  --   @chunkAlong (n + 1) 1@ and every chunk is rebuilt with 'unsentences'.
  -- * @'Paragraphs' _@: the text is parsed with 'parseTags' and the content
  --   of each text tag is returned; the 'Int' argument is ignored.
  --
  -- For examples at a higher level (sentences and paragraphs), see
  -- 'Gargantext.Text.Examples.ex_terms'.
  --
  -- >>> splitBy (Chars 0) (pack "abcde")
  -- ["a","b","c","d","e"]
  --
  -- >>> splitBy (Chars 1) (pack "abcde")
  -- ["ab","bc","cd","de"]
  --
  -- >>> splitBy (Chars 2) (pack "abcde")
  -- ["abc","bcd","cde"]
  splitBy :: SplitContext -> Text -> [Text]
  splitBy context txt =
    case context of
      Chars n      -> map pack        (chunkAlong (n + 1) 1 (unpack txt))
      Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences txt))
      Paragraphs _ -> map tagContent  (filter isTagText (parseTags txt))
    where
      -- Only text tags get here, because of the 'isTagText' filter above;
      -- the fallback equation merely keeps the helper total.
      tagContent :: IsString p => Tag p -> p
      tagContent (TagText body) = body
      tagContent _              = ""
  72