]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Context.hs
[Index with TermList] compiles but weird behavior.
[gargantext.git] / src / Gargantext / Text / Context.hs
{-|
Module      : Gargantext.Text.Context
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Tools for managing the context of a text; this module holds the logic of
the main types.

-}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Context
18 where
19
20 import Data.Text (Text, pack, unpack)
21 import Data.String (IsString)
22
23 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
24 import Gargantext.Text
25 import Gargantext.Prelude hiding (length)
26
27
28 ------------------------------------------------------------------------
29
-- | A single term, stored as 'Text'.
type Term = Text

-- | A label is itself a 'Term'.
type Label = Term

-- | Each entry pairs a 'Label' with a list of 'Term' sequences.
type TermList = [(Label, [[Term]])]

-- | A sentence (or a nominal group), as a list of elements.
type Sentence a = [a]

-- | A corpus, as a list of sentences.
type Corpus a = [Sentence a]
37
38 -- type ConText a = [Sentence a]
39 -- type Corpus a = [ConText a]
40
41
42 ------------------------------------------------------------------------
43
-- | Granularity selector for 'splitBy'; every constructor carries an 'Int'.
data SplitContext
  = Chars Int       -- ^ character contexts; 'splitBy' passes @n + 1@ as the first argument of @chunkAlong@
  | Sentences Int   -- ^ sentence contexts; 'splitBy' passes @n + 1@ as the first argument of @chunkAlong@
  | Paragraphs Int  -- ^ paragraph contexts; 'splitBy' currently ignores the 'Int'
45
-- | Turn a 'Text' into its list of tags; a thin wrapper around 'parseTags'.
tag :: Text -> [Tag Text]
tag markup = parseTags markup
48
-- | Split a text into contexts of characters, sentences or paragraphs.
--
-- * @Chars n@: unpacks the text, runs @chunkAlong (n + 1) 1@ over its
--   characters and packs every chunk back into a 'Text'.
-- * @Sentences n@: runs @chunkAlong (n + 1) 1@ over the output of
--   @sentences@ and rebuilds every chunk with @unsentences@.
-- * @Paragraphs _@: keeps the content of each text tag returned by 'tag';
--   the 'Int' is ignored.
--
-- Examples:
--
-- >> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context input = case context of
  Chars n      -> map pack (chunkAlong (n + 1) 1 (unpack input))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences input))
  Paragraphs _ -> map tagContent (filter isTagText (tag input))
  where
    -- Content of a text tag; any other tag yields the empty string.
    tagContent :: IsString s => Tag s -> s
    tagContent (TagText content) = content
    tagContent _                 = ""
64
65