]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
[WithList] adding labelPolicy.
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE BangPatterns #-}
16
17 module Gargantext.Text.Terms.WithList where
18
19 import Prelude (String)
20 import qualified Data.Algorithms.KMP as KMP
21 import Data.Char (isSpace)
22 import qualified Data.Text as T
23 import Data.Text (Text)
24 import qualified Data.IntMap.Strict as IntMap
25
26 import Gargantext.Text.Context
27 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
28
29 import Gargantext.Prelude
30 import Data.List (concatMap)
31
32
33
34 type Pattern = KMP.Table Term
35 type TermList = [(Label, [[Term]])]
36 type Patterns = [(Pattern, Int, Label)]
37
38
39 replaceTerms :: (Term -> Label) -> Patterns -> Sentence Term -> Sentence Label
40 replaceTerms labelPolicy pats terms = go 0 terms
41 where
42 go _ [] = []
43 go !ix (t:ts) =
44 case IntMap.lookup ix m of
45 Nothing -> t : go (ix + 1) ts
46 Just (len, label) ->
47 label : go (ix + len) (drop (len - 1) ts)
48
49 -- | merge with labelPolicy (can be a Map Term label)
50 merge (len1, lab1) (len2, lab2) =
51 if (labelPolicy lab1) == lab2 then (len2, lab2) else (len1, lab1)
52
53 m =
54 IntMap.fromListWith merge
55 [ (ix, (len, label))
56 | (pat, len, label) <- pats, ix <- KMP.match pat terms ]
57
58 buildPatterns :: TermList -> Patterns
59 buildPatterns = concatMap buildPattern
60 where
61 buildPattern (label, alts) = map f alts
62 where
63 f alt = (KMP.build alt, length alt, label)
64
65 extractTermsWithList :: (Term -> Label) -> Patterns -> Text -> Corpus Label
66 extractTermsWithList labelPolicy pats = map (replaceTerms labelPolicy pats) . monoTextsBySentence