]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Context.hs
Some fixes
[gargantext.git] / src / Gargantext / Text / Context.hs
1 {-|
2 Module : Gargantext.Text.Context
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Context of text management tool: this module holds the logic of the main types.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Context
18 where
19
20 import Data.Text (Text, pack, unpack)
21 import Data.String (IsString)
22
23 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
24 import Gargantext.Text
25 import Gargantext.Prelude hiding (length)
26
27
28 ------------------------------------------------------------------------
29
-- | A term, represented as a plain 'Text' value.
type Term  = Text

-- | A label; simply another name for 'Term'.
type Label = Term
32
-- | A sentence (or a nominal group): a list of elements of type @a@.
type Sentence a = [a]

-- | A corpus: a list of sentences.
type Corpus a = [Sentence a]
35
36 -- type ConText a = [Sentence a]
37 -- type Corpus a = [ConText a]
38
39
40 ------------------------------------------------------------------------
41
-- | Unit used by 'splitBy' to cut a text into contexts.
--
-- For 'Chars' and 'Sentences', 'splitBy' takes the carried 'Int' as @n@ and
-- builds its chunks with @chunkAlong (n + 1) 1@. The 'Int' carried by
-- 'Paragraphs' is currently ignored by 'splitBy'.
data SplitContext
  = Chars Int
  | Sentences Int
  | Paragraphs Int
43
-- | Parse a 'Text' into a list of TagSoup tags.
--
-- Thin wrapper that specialises 'parseTags' to 'Text'.
tag :: Text -> [Tag Text]
tag input = parseTags input
46
-- | Cut a text into contexts according to the given 'SplitContext'.
--
-- * @Chars n@ chunks the characters of the text with @chunkAlong (n + 1) 1@
--   and packs every chunk back into a 'Text'.
-- * @Sentences n@ does the same over the result of 'sentences', gluing every
--   chunk back together with 'unsentences'.
-- * @Paragraphs _@ ignores its 'Int' and returns the text nodes produced by
--   'tag' (i.e. by 'parseTags'), in order.
--
-- Examples for 'Chars':
--
-- >> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context = case context of
  Chars n      -> \txt -> map pack (windows n (unpack txt))
  Sentences n  -> \txt -> map unsentences (windows n (sentences txt))
  Paragraphs _ -> \txt -> map textOf (filter isTagText (tag txt))
  where
    -- Chunks of @n + 1@ elements taken with a step of 1 (see the examples
    -- above for the resulting shape).
    windows n = chunkAlong (n + 1) 1

    -- Content of a text node. The fallback is never reached after
    -- @filter isTagText@; it only keeps the helper total.
    textOf :: IsString p => Tag p -> p
    textOf node = case node of
      TagText body -> body
      _            -> ""
62
63