]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Context.hs
[Pipeline] ok until clustering.
[gargantext.git] / src / Gargantext / Text / Context.hs
1 {-|
2 Module : Gargantext.Text.Context
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Context of text management tool
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Text.Context
18 where
19
20 import Data.Text (Text, pack, unpack, length)
21 import Data.String (IsString)
22
23 import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
24 import Gargantext.Text
25 import Gargantext.Prelude hiding (length)
26
27
-- | Granularity selector consumed by 'splitBy'.
data SplitContext
  = Chars Int
    -- ^ @Chars n@: contexts built from chunks of @n + 1@ characters.
  | Sentences Int
    -- ^ @Sentences n@: contexts built from chunks of @n + 1@ sentences.
  | Paragraphs Int
    -- ^ Contexts are the text nodes of the parsed markup.
    -- NOTE(review): 'splitBy' currently ignores the 'Int' carried here.
29
-- | Parse a 'Text' into a list of TagSoup tags (alias for 'parseTags').
--
-- The explicit signature pins the type that the monomorphism restriction
-- would otherwise leave to be inferred from the call site in 'splitBy'.
tag :: Text -> [Tag Text]
tag = parseTags
-- | Cut a text into contexts of characters, sentences or markup text nodes.
--
-- * @'Chars' n@ unpacks the text, feeds the characters to
--   @chunkAlong (n + 1) 1@ and packs every chunk back into a 'Text'.
-- * @'Sentences' n@ does the same over the output of 'sentences', joining
--   every chunk with 'unsentences'.
-- * @'Paragraphs' _@ keeps the content of every text node returned by 'tag';
--   its 'Int' argument is ignored.
--
-- >>> splitBy (Chars 0) "abcde"
-- ["a","b","c","d","e"]
-- >>> splitBy (Chars 1) "abcde"
-- ["ab","bc","cd","de"]
-- >>> splitBy (Chars 2) "abcde"
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy ctx = case ctx of
  Chars n      -> map pack . chunkAlong (n + 1) 1 . unpack
  Sentences n  -> map unsentences . chunkAlong (n + 1) 1 . sentences
  Paragraphs _ -> textNodes
  where
    -- Non-text tags fail the generator pattern and are skipped, so only the
    -- payload of 'TagText' nodes survives, in document order.
    textNodes markup = [ content | TagText content <- tag markup ]
46
47