]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
install: proposal
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE BangPatterns #-}
17
18 module Gargantext.Text.Terms.WithList where
19
20 import qualified Data.Algorithms.KMP as KMP
21 import Data.Text (Text)
22 import qualified Data.IntMap.Strict as IntMap
23
24 import Gargantext.Core.Types (Terms(..))
25 import Gargantext.Text.Context
26 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
27
28 import Prelude (error)
29 import Gargantext.Prelude
30 import Data.List (null, concatMap)
31 import Data.Ord
32 import qualified Data.Set as Set
33
34
35 ------------------------------------------------------------------------
36
37 data Pattern = Pattern
38 { _pat_table :: !(KMP.Table Term)
39 , _pat_length :: !Int
40 , _pat_terms :: !Terms
41 }
42 type Patterns = [Pattern]
43
44 ------------------------------------------------------------------------
45
46 replaceTerms :: Patterns -> Sentence Term -> Sentence Terms
47 replaceTerms pats terms = go 0
48 where
49 terms_len = length terms
50
51 go ix | ix >= terms_len = []
52 | otherwise =
53 case IntMap.lookup ix m of
54 Nothing -> go (ix + 1)
55 Just (len, term) ->
56 term : go (ix + len)
57
58
59 merge (len1, lab1) (len2, lab2) =
60 if len2 < len1 then (len1, lab1) else (len2, lab2)
61
62 m =
63 IntMap.fromListWith merge
64 [ (ix, (len, term))
65 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
66
67 buildPatterns :: TermList -> Patterns
68 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
69 where
70 buildPattern (label, alts) = map f (label : alts)
71 where
72 f alt | "" `elem` alt = error "buildPatterns: ERR1"
73 | null alt = error "buildPatterns: ERR2"
74 | otherwise =
75 Pattern (KMP.build alt) (length alt)
76 (Terms label $ Set.empty) -- TODO check stems
77
78 extractTermsWithList :: Patterns -> Text -> Corpus Terms
79 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence