2 Module : Gargantext.Text.Context
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Context-of-text management tool: this module holds the logic of the main types.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Text.Context
20 import Data.Text (Text, pack, unpack)
21 import Data.String (IsString)
23 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
24 import Gargantext.Text
25 import Gargantext.Prelude hiding (length)
28 ------------------------------------------------------------------------
-- | A sequence of 'Term's.
type MultiTerm = [Term]

-- | A label has exactly the same representation as a 'MultiTerm'.
type Label = MultiTerm

-- | Pairs of a 'Label' and a list of 'MultiTerm's
-- (presumably the forms grouped under that label -- TODO confirm).
type TermList = [(Label, [MultiTerm])]

-- | A sentence (or a nominal group): a list of elements of type @a@.
type Sentence a = [a]

-- | A corpus: a list of sentences.
type Corpus a = [Sentence a]
39 -- type ConText a = [Sentence a]
40 -- type Corpus a = [ConText a]
43 ------------------------------------------------------------------------
-- | Selects how 'splitBy' cuts a text into contexts.
data SplitContext
  = Chars Int
    -- ^ Character contexts: for @Chars n@, 'splitBy' calls
    -- @chunkAlong (n + 1) 1@ on the unpacked characters.
  | Sentences Int
    -- ^ Sentence contexts: for @Sentences n@, 'splitBy' calls
    -- @chunkAlong (n + 1) 1@ on the output of @sentences@.
  | Paragraphs Int
    -- ^ Paragraph contexts: 'splitBy' keeps the elements of @tag@'s output
    -- accepted by @isTagText@; the 'Int' payload is currently ignored.
47 tag :: Text -> [Tag Text]
-- | Cut a 'Text' into contexts according to the given 'SplitContext'.
--
-- * @Chars n@: unpacks the text, calls @chunkAlong (n + 1) 1@ on the
--   characters and packs every chunk back into a 'Text'.
-- * @Sentences n@: applies the same chunking to the result of @sentences@
--   and rebuilds each chunk with @unsentences@.
-- * @Paragraphs _@: keeps the elements of @tag@'s output accepted by
--   'isTagText' and extracts them with @unTag@; the 'Int' payload is ignored.
--
-- Documented examples:
--
-- @
-- splitBy (Chars 0) "abcde" == ["a","b","c","d","e"]
-- splitBy (Chars 1) "abcde" == ["ab","bc","cd","de"]
-- splitBy (Chars 2) "abcde" == ["abc","bcd","cde"]
-- @
splitBy :: SplitContext -> Text -> [Text]
splitBy context txt = case context of
  Chars n      -> map pack        (windows n (unpack txt))
  Sentences n  -> map unsentences (windows n (sentences txt))
  Paragraphs _ -> map unTag (filter isTagText (tag txt))
  where
    -- Chunking step shared by the 'Chars' and 'Sentences' cases.
    windows n = chunkAlong (n + 1) 1
62 unTag :: IsString p => Tag p -> p