src/Gargantext/Text/Context.hs

   1 {-|
   2 Module      : Gargantext.Text.Context
   3 Description : How to manage contexts of texts ?
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Context of text management tool; here is the logic of the main types:
  11
  12 - Term
  13 - Multi-term
  14 - Label
  15 - Sentence
  16 - Corpus
  17
  18 How to split contexts is described in this module.
  19
  20 -}
  21
  22 {-# LANGUAGE NoImplicitPrelude #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Context
  26   where
  27
  28 import Data.Text (Text, pack, unpack)
  29 import Data.String (IsString)
  30
  31 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
  32 import Gargantext.Text
  33 import Gargantext.Prelude hiding (length)
  34
  35 ------------------------------------------------------------------------
  36 type Term = Text
  37 type MultiTerm = [Term]
  38 type Label = MultiTerm
  39
  40 type TermList = [(Label, [MultiTerm])]
  41
  42 type Sentence  a = [a] -- or a nominal group
  43 type Corpus    a = [Sentence a] -- a list of sentences
  44
  45 -- type ConText a = [Sentence a]
  46 -- type Corpus a = [ConText a]
  47 ------------------------------------------------------------------------
  48
  49 -- | Contexts definition to build/unbuild contexts.
  50 data SplitContext = Chars Int | Sentences Int | Paragraphs Int
  51
  52 tag :: Text -> [Tag Text]
  53 tag = parseTags
  54
  55 -- | splitBy contexts of Chars or Sentences or Paragraphs
  56 -- To see some examples at a higher level (sentences and paragraph), see
  57 -- 'Gargantext.Text.Examples.ex_terms'
  58 --
  59 -- >>> splitBy (Chars 0) (pack "abcde")
  60 -- ["a","b","c","d","e"]
  61 --
  62 -- >>> splitBy (Chars 1) (pack "abcde")
  63 -- ["ab","bc","cd","de"]
  64 --
  65 -- >>> splitBy (Chars 2) (pack "abcde")
  66 -- ["abc","bcd","cde"]
  67 splitBy :: SplitContext -> Text -> [Text]
  68 splitBy (Chars     n)  = map pack        . chunkAlong (n+1) 1 . unpack
  69 splitBy (Sentences n)  = map unsentences . chunkAlong (n+1) 1 . sentences
  70 splitBy (Paragraphs _) = map unTag       . filter isTagText   . tag
  71   where
  72     unTag :: IsString p => Tag p -> p
  73     unTag (TagText x) = x
  74     unTag _           = ""
  75
  76