]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Context.hs
[REFACT] before scoring new ngrams lists.
[gargantext.git] / src / Gargantext / Text / Context.hs
1 {-|
2 Module : Gargantext.Text.Context
3 Description : How to manage contexts of texts ?
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Context of text management tool; here is the logic of the main types:
11
12 - Term
13 - Multi-term
14 - Label
15 - Sentence
16 - Corpus
17
18 How to split contexts is described in this module.
19
20 -}
21
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Context
26 where
27
28 import Data.Text (Text, pack, unpack)
29 import Data.String (IsString)
30
31 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
32 import Gargantext.Text
33 import Gargantext.Prelude hiding (length)
34
35 ------------------------------------------------------------------------
-- | A single term, stored as 'Text'.
type Term      = Text

-- | A list of 'Term's treated as one unit.
type MultiTerm = [Term]

-- | A label has the same representation as a 'MultiTerm'.
type Label     = MultiTerm

-- | Pairs of a 'Label' with a list of 'MultiTerm's
-- (presumably the multi-terms grouped under that label -- to be confirmed
-- against the callers).
type TermList  = [(Label, [MultiTerm])]

-- | A sentence (or a nominal group): a list of items of type @a@.
type Sentence a = [a]

-- | A corpus: a list of sentences.
type Corpus a   = [Sentence a]
44
45 -- type ConText a = [Sentence a]
46 -- type Corpus a = [ConText a]
47 ------------------------------------------------------------------------
48
-- | Describes how 'splitBy' should cut a text into contexts.
data SplitContext
  = Chars Int
    -- ^ Character contexts: 'splitBy' requests chunks of @n + 1@ characters.
  | Sentences Int
    -- ^ Sentence contexts: 'splitBy' requests chunks of @n + 1@ sentences.
  | Paragraphs Int
    -- ^ Text nodes of the parsed markup; 'splitBy' currently ignores the 'Int'.
51
-- | Parse a 'Text' into a list of TagSoup 'Tag's.
--
-- This is 'parseTags' specialised to 'Text'.
tag :: Text -> [Tag Text]
tag markup = parseTags markup
54
-- | Split a 'Text' into contexts of characters, sentences or paragraphs.
--
-- * @'Chars' n@ and @'Sentences' n@ hand the unpacked characters
--   (respectively the 'sentences' of the text) to @chunkAlong (n + 1) 1@ and
--   re-assemble every chunk into a 'Text'.
-- * @'Paragraphs' _@ parses the text with 'tag' and keeps the content of the
--   text nodes only; its 'Int' argument is ignored.
--
-- For examples at a higher level (sentences and paragraphs), see
-- 'Gargantext.Text.Examples.ex_terms'.
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context = case context of
  Chars n      -> map pack . chunkAlong (n + 1) 1 . unpack
  Sentences n  -> map unsentences . chunkAlong (n + 1) 1 . sentences
  Paragraphs _ -> map tagContent . filter isTagText . tag
  where
    -- Content of a text node. The fallback is never reached from 'splitBy',
    -- because non-text tags are filtered out before this is applied.
    tagContent :: IsString p => Tag p -> p
    tagContent node = case node of
      TagText content -> content
      _               -> ""
75
76