]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Context.hs
Merge branch 'dev' into stable
[gargantext.git] / src / Gargantext / Text / Context.hs
1 {-|
2 Module : Gargantext.Text.Context
3 Description : How to manage contexts of texts ?
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Context of text management tool; here is the logic of the main types:
11
12 - Term
13 - Multi-term
14 - Label
15 - Sentence
16 - Corpus
17
18 How to split contexts is described in this module.
19
20 -}
21
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Text.Context
26 where
27
28 import Data.Text (Text, pack, unpack)
29 import Data.String (IsString)
30
31 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
32 import Gargantext.Text
33 import Gargantext.Prelude hiding (length)
34
35 ------------------------------------------------------------------------
-- | A term, represented as 'Text'.
type Term = Text

-- | A term made of several parts, kept as the ordered list of its 'Term's.
type MultiTerm = [Term]

-- | A label shares the representation of a 'MultiTerm'.
type Label = MultiTerm

-- | Association list pairing each 'Label' with a list of 'MultiTerm's.
-- NOTE(review): presumably the multi-terms grouped under that label — confirm.
type TermList = [(Label, [MultiTerm])]

-- | A sequence of items of type @a@ (or a nominal group).
type Sentence a = [a]

-- | A list of sentences.
type Corpus a = [Sentence a]
44
45 -- type ConText a = [Sentence a]
46 -- type Corpus a = [ConText a]
47 ------------------------------------------------------------------------
48
-- | Granularity at which a text is cut into contexts.
--
-- 'splitBy' interprets each constructor; the 'Int' is the size parameter
-- handed to it.
data SplitContext
  = Chars Int
    -- ^ Character windows: 'splitBy' uses the 'Int' plus one as window size.
  | Sentences Int
    -- ^ Sentence windows: 'splitBy' uses the 'Int' plus one as window size.
  | Paragraphs Int
    -- ^ Paragraph contexts: 'splitBy' currently ignores the 'Int'.
51
-- | Split a text into contexts of characters, sentences or paragraphs.
--
-- * @Chars n@: the text is unpacked and passed through @chunkAlong (n+1) 1@;
--   as the examples below show, this gives overlapping windows of @n+1@
--   characters, each starting one character after the previous one.
-- * @Sentences n@: the same @chunkAlong (n+1) 1@ windowing, applied to the
--   output of 'sentences'; each window is rejoined with 'unsentences'.
-- * @Paragraphs _@: the text is parsed with TagSoup's 'parseTags' and the
--   content of every text tag is returned.
--   NOTE(review): the 'Int' is ignored in this case — confirm this is intended.
--
-- For examples at a higher level (sentences and paragraphs), see
-- 'Gargantext.Text.Examples.ex_terms'.
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context txt = case context of
  Chars n      -> map pack (chunkAlong (n + 1) 1 (unpack txt))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences txt))
  Paragraphs _ -> map tagContent (filter isTagText (parseTags txt))
  where
    -- Extract the payload of a text tag. The fallback equation only keeps the
    -- helper total: non-text tags are already removed by 'isTagText'.
    tagContent :: IsString p => Tag p -> p
    tagContent (TagText body) = body
    tagContent _              = ""
72