2 Module : Gargantext.Text.Context
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
Context management tools for text: this module holds the logic of the main types.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Text.Context
20 import Data.Text (Text, pack, unpack)
21 import Data.String (IsString)
23 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
24 import Gargantext.Text
25 import Gargantext.Prelude hiding (length)
28 ------------------------------------------------------------------------
-- | A list of entries, each pairing a 'Label' with a list of term sequences
-- (@[[Term]]@).
-- NOTE(review): presumably the term sequences are the forms grouped under
-- that label -- confirm against the definitions of 'Label' and 'Term'.
type TermList = [(Label, [[Term]])]

-- | A sentence (or a nominal group), represented as a list of items.
type Sentence a = [a]

-- | A corpus, represented as a list of sentences.
type Corpus a = [Sentence a]
38 -- type ConText a = [Sentence a]
39 -- type Corpus a = [ConText a]
42 ------------------------------------------------------------------------
-- | Granularity at which 'splitBy' cuts a text into contexts. Each
-- constructor carries an 'Int' parameter.
data SplitContext
  = Chars Int
    -- ^ Character windows; 'splitBy' uses a window of @n + 1@ characters.
  | Sentences Int
    -- ^ Sentence windows; 'splitBy' uses a window of @n + 1@ sentences.
  | Paragraphs Int
    -- ^ Paragraph mode; 'splitBy' currently ignores the 'Int' argument.
46 tag :: Text -> [Tag Text]
-- | Cut a 'Text' into a list of contexts, at the granularity selected by the
-- 'SplitContext' argument.
--
-- * @Chars n@: the text is unpacked, windowed with @chunkAlong (n + 1) 1@
--   and every window is packed back into a 'Text'.
-- * @Sentences n@: the output of 'sentences' is windowed the same way and
--   every window is rejoined with 'unsentences'.
-- * @Paragraphs _@: the 'Int' is ignored; the tags returned by 'tag' are
--   filtered with 'isTagText' and converted with 'unTag'.
--
-- Examples for the character mode (illustrative, not run as doctests):
--
-- > splitBy (Chars 0) "abcde" == ["a","b","c","d","e"]
-- > splitBy (Chars 1) "abcde" == ["ab","bc","cd","de"]
-- > splitBy (Chars 2) "abcde" == ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context txt = case context of
  Chars n      -> map pack (chunkAlong (n + 1) 1 (unpack txt))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences txt))
  Paragraphs _ -> map unTag (filter isTagText (tag txt))
61 unTag :: IsString p => Tag p -> p