]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
More WithList fixes
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE BangPatterns #-}
16
17 module Gargantext.Text.Terms.WithList where
18
19 import qualified Data.Algorithms.KMP as KMP
20 import Data.Text (Text)
21 import qualified Data.IntMap.Strict as IntMap
22
23 import Gargantext.Core.Types (Terms(..))
24 import Gargantext.Text.Context
25 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
26
27 import Gargantext.Prelude
28 import Data.List (concatMap)
29 import Data.Ord
30 import qualified Data.Set as Set
31
32
33 ------------------------------------------------------------------------
34
35 data Pattern = Pattern
36 { _pat_table :: !(KMP.Table Term)
37 , _pat_length :: !Int
38 , _pat_terms :: !Terms
39 }
40 type Patterns = [Pattern]
41
42 ------------------------------------------------------------------------
43
44 replaceTerms :: Patterns -> Sentence Term -> Sentence Terms
45 replaceTerms pats terms = go 0
46 where
47 terms_len = length terms
48
49 go ix | ix >= terms_len = []
50 | otherwise =
51 case IntMap.lookup ix m of
52 Nothing -> go (ix + 1)
53 Just (len, term) ->
54 term : go (ix + len)
55
56
57 merge (len1, lab1) (len2, lab2) =
58 if len2 < len1 then (len1, lab1) else (len2, lab2)
59
60 m =
61 IntMap.fromListWith merge
62 [ (ix, (len, term))
63 | Pattern pat len term <- pats, ix <- KMP.match pat (_terms_label term) ]
64
65 buildPatterns :: TermList -> Patterns
66 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
67 where
68 buildPattern (label, alts) = map f (label : alts)
69 where
70 f alt = Pattern (KMP.build alt) (length alt)
71 (Terms label $ Set.empty) -- TODO check stems
72
73 extractTermsWithList :: Patterns -> Text -> Corpus Terms
74 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence