]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Context.hs
[FIX] Clean Text before sending it to NLP micro services + tests + clean code for...
[gargantext.git] / src / Gargantext / Core / Text / Context.hs
1 {-|
2 Module : Gargantext.Core.Text.Context
3 Description : How to manage contexts of texts ?
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Context of text management tool; here is the logic of the main types:
11
12 - Term
13 - Multi-term
14 - Label
15 - Sentence
16 - Corpus
17
18 How to split contexts is described in this module.
19
20 -}
21
22
23 module Gargantext.Core.Text.Context
24 where
25
26 import Data.Text (Text, pack, unpack)
27 import Data.String (IsString)
28
29 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
30 import Gargantext.Core.Text
31 import Gargantext.Prelude hiding (length)
32
33 ------------------------------------------------------------------------
-- | A single term, represented as 'Text'.
type Term      = Text

-- | A multi-term: a list of 'Term's.
type MultiTerm = [Term]

-- | A label; it has the same representation as a 'MultiTerm'.
type Label     = MultiTerm

-- | A list pairing each 'Label' with a list of 'MultiTerm's
-- (presumably the multi-terms attached to that label; confirm with callers).
type TermList  = [(Label, [MultiTerm])]

-- | A sentence (or a nominal group): a list of elements of type @a@.
type Sentence a = [a]

-- | A corpus: a list of sentences.
type Corpus a   = [Sentence a]
42
43 -- type ConText a = [Sentence a]
44 -- type Corpus a = [ConText a]
45 ------------------------------------------------------------------------
46
-- | Kinds of context that 'splitBy' can cut a text into. Every constructor
-- carries an 'Int' parameter.
data SplitContext
  = Chars Int
    -- ^ Character-level context; 'splitBy' chunks the characters using this
    -- value plus one.
  | Sentences Int
    -- ^ Sentence-level context; 'splitBy' chunks the sentences using this
    -- value plus one.
  | Paragraphs Int
    -- ^ Paragraph-level context; 'splitBy' currently ignores the 'Int'.
49
-- | Cut a 'Text' into contexts according to the given 'SplitContext'.
--
-- * @Chars n@: the characters are chunked with 'chunkAlong' using a size of
--   @n + 1@ and a step of @1@, then packed back into 'Text'.
-- * @Sentences n@: same chunking, applied to the output of 'sentences' and
--   rebuilt with 'unsentences'.
-- * @Paragraphs _@: the input is parsed with 'parseTags' and only the content
--   of its text tags is kept; the 'Int' argument is ignored.
--
-- Higher-level examples (sentences and paragraphs) can be found in
-- 'Gargantext.Core.Text.Examples.ex_terms'.
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy ctx txt = case ctx of
  Chars n      -> map pack        (chunkAlong (n + 1) 1 (unpack txt))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences txt))
  Paragraphs _ -> [ textOf tag | tag <- parseTags txt, isTagText tag ]
  where
    -- Extract the payload of a text tag. The fallback is never reached from
    -- the comprehension above, which only keeps tags accepted by 'isTagText'.
    textOf :: IsString p => Tag p -> p
    textOf (TagText body) = body
    textOf _              = ""
70