]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
[Index with TermList] compiles but weird behavior.
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE BangPatterns #-}
16
17 module Gargantext.Text.Terms.WithList where
18
19 import qualified Data.Algorithms.KMP as KMP
20 import Data.Text (Text)
21 import qualified Data.IntMap.Strict as IntMap
22
23 import Gargantext.Text.Context
24 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
25
26 import Gargantext.Prelude
27 import Data.List (concatMap)
28
29
30 ------------------------------------------------------------------------
31
32 type Pattern = KMP.Table Term
33 type Patterns = [(Pattern, Int, Label)]
34
35 ------------------------------------------------------------------------
36
37 replaceTerms :: Patterns -> Sentence Term -> Sentence Label
38 replaceTerms pats terms = go 0 terms
39 where
40 go _ [] = []
41 go !ix (t:ts) =
42 case IntMap.lookup ix m of
43 Nothing -> t : go (ix + 1) ts
44 Just (len, label) ->
45 label : go (ix + len) (drop (len - 1) ts)
46
47 merge (len1, lab1) (len2, lab2) =
48 if len2 < len1 then (len1, lab1) else (len2, lab2)
49
50 m =
51 IntMap.fromListWith merge
52 [ (ix, (len, label))
53 | (pat, len, label) <- pats, ix <- KMP.match pat terms ]
54
55 buildPatterns :: TermList -> Patterns
56 buildPatterns = concatMap buildPattern
57 where
58 buildPattern (label, alts) = map f alts
59 where
60 f alt = (KMP.build alt, length alt, label)
61
62 extractTermsWithList :: Patterns -> Text -> Corpus Label
63 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence