]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Context.hs
install: proposal
[gargantext.git] / src / Gargantext / Text / Context.hs
1 {-|
2 Module : Gargantext.Text.Context
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Text context management: the main types and the logic for splitting a text into contexts.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Context
18 where
19
20 import Data.Text (Text, pack, unpack)
21 import Data.String (IsString)
22
23 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
24 import Gargantext.Text
25 import Gargantext.Prelude hiding (length)
26
27
28 ------------------------------------------------------------------------
29
-- | A term, represented as a 'Text'.
type Term = Text

-- | A list of 'Term's.
type MultiTerm = [Term]

-- | A 'MultiTerm' used as a label.
type Label = MultiTerm

-- | A list of pairs, each associating a 'Label' with a list of 'MultiTerm's.
-- NOTE(review): presumably the 'MultiTerm's are the forms grouped under the
-- label -- confirm against callers.
type TermList = [(Label, [MultiTerm])]

-- | A sentence (or a nominal group), as a list of items.
type Sentence a = [a]

-- | A corpus, as a list of sentences.
type Corpus a = [Sentence a]
38
39 -- type ConText a = [Sentence a]
40 -- type Corpus a = [ConText a]
41
42
43 ------------------------------------------------------------------------
44
-- | Selects how 'splitBy' cuts a text into contexts. Every constructor carries
-- an 'Int' parameter.
data SplitContext
  = Chars Int
    -- ^ Split on characters; 'splitBy' chunks with a size of @n + 1@ and a
    -- step of 1.
  | Sentences Int
    -- ^ Split on sentences; 'splitBy' chunks with a size of @n + 1@ and a
    -- step of 1.
  | Paragraphs Int
    -- ^ Split on the text nodes of the parsed markup; 'splitBy' currently
    -- ignores the 'Int'.
46
-- | Parse a 'Text' into a list of TagSoup 'Tag's; a thin wrapper around
-- 'parseTags'.
tag :: Text -> [Tag Text]
tag txt = parseTags txt
49
-- | Split a text into contexts of characters, sentences or paragraphs.
--
-- * @'Chars' n@: the text is unpacked, chunked with 'chunkAlong' using a size
--   of @n + 1@ and a step of 1, and each chunk is packed back into a 'Text'.
-- * @'Sentences' n@: same chunking, applied to the result of 'sentences';
--   each chunk is joined back with 'unsentences'.
-- * @'Paragraphs' _@: the text is parsed with 'tag' and the content of every
--   text tag is returned; the 'Int' argument is ignored.
--
-- >>> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >>> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >>> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context txt = case context of
  Chars n      -> map pack (chunkAlong (n + 1) 1 (unpack txt))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences txt))
  Paragraphs _ -> map tagContent (filter isTagText (tag txt))
  where
    -- Extract the payload of a text tag. The fallback is only there to keep
    -- the helper total: non-text tags are filtered out before it is applied.
    tagContent :: IsString p => Tag p -> p
    tagContent (TagText content) = content
    tagContent _                 = ""
65
66