]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
[FIX] encoding update node
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
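Extraction of known terms from a text. A 'TermList' is compiled into
KMP search patterns with 'buildPatterns'; the patterns are then matched
against the tokenised text to recover the labels of the terms it contains
(see 'replaceTerms' and 'extractTermsWithList').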
12
13 -}
14 {-# LANGUAGE BangPatterns #-}
15
16 module Gargantext.Text.Terms.WithList where
17
18 import Data.List (null, concatMap)
19 import Data.Ord
20 import Data.Text (Text, concat)
21 import Gargantext.Prelude
22 import Gargantext.Text.Context
23 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
24 import Prelude (error)
25 import qualified Data.Algorithms.KMP as KMP
26 import qualified Data.IntMap.Strict as IntMap
27
28 ------------------------------------------------------------------------
29
-- | One searchable form of a term, ready for KMP matching.
data Pattern = Pattern
  { _pat_table  :: !(KMP.Table Text)
    -- ^ KMP search table built from the tokens of this form.
  , _pat_length :: !Int
    -- ^ Number of tokens in this form; 'replaceTerms' skips this many
    -- tokens after a match.
  , _pat_terms  :: ![Text]
    -- ^ Label emitted by 'replaceTerms' when this form is found.
  }

-- | A list of patterns to search for.
type Patterns = [Pattern]
36
37 ------------------------------------------------------------------------
-- | Scan a tokenised text and return the label of every pattern occurrence.
--
-- The start positions reported by 'KMP.match' for every pattern are indexed
-- by position. The text is then walked from left to right: where at least
-- one pattern starts, the label of the longest one is emitted and the walk
-- resumes right after that occurrence; elsewhere it advances by one token.
-- Tokens that are not covered by any pattern produce no output.
--
-- Patterns whose '_pat_length' is not strictly positive are ignored: a match
-- of such a pattern would never move the cursor forward, so the result would
-- never end.
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
  where
    terms_len = length terms

    go ix
      | ix >= terms_len = []
      | otherwise =
          case IntMap.lookup ix m of
            Nothing          -> go (ix + 1)
            -- len > 0 is guaranteed by the filter in 'm', so this advances.
            Just (len, term) -> term : go (ix + len)

    -- Two patterns starting at the same position: keep the longer one.
    -- 'IntMap.fromListWith' passes the newly inserted value first, so on
    -- equal lengths the value that was inserted earlier is kept.
    merge (len1, lab1) (len2, lab2) =
      if len2 < len1 then (len1, lab1) else (len2, lab2)

    -- Start position -> (length, label) of the longest pattern found there.
    m =
      IntMap.fromListWith merge
        [ (ix, (len, term))
        | Pattern pat len term <- pats
        , len > 0
        , ix <- KMP.match pat terms
        ]
58
-- | Compile a term list into patterns, ordered by descending '_pat_length'.
--
-- Every entry of the term list contributes one pattern for its label and one
-- for each of its alternative forms; all of them carry the entry's label as
-- '_pat_terms'.
--
-- Calls 'error' with @"buildPatterns: ERR1"@ when a form contains an empty
-- token, and with @"buildPatterns: ERR2"@ when a form has no token at all.
buildPatterns :: TermList -> Patterns
buildPatterns termList = sortWith (Down . _pat_length) allPatterns
  where
    allPatterns = concatMap entryPatterns termList

    -- The label itself is searched for, in addition to its alternatives.
    entryPatterns (label, alts) = map (compile label) (label : alts)

    -- TODO check stems
    compile label form
      | null form      = error "buildPatterns: ERR2"
      | "" `elem` form = error "buildPatterns: ERR1"
      | otherwise      = Pattern (KMP.build form) (length form) label
69
-- | Split a text with 'monoTextsBySentence' and run 'replaceTerms' on each
-- resulting token list, keeping one list of labels per token list.
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats txt =
  [ replaceTerms pats tokens | tokens <- monoTextsBySentence txt ]
72
-- | Extract terms.
--
-- Each token list returned by 'monoTextsBySentence' yields exactly one
-- 'Text': every label found by 'replaceTerms' is flattened with 'concat',
-- and the flattened labels are then concatenated together.
--
-- NOTE(review): labels of several matches in the same token list are joined
-- without any separator; confirm this is intended.
--
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- >>> extractTermsWithList' (buildPatterns termList) "Le chat blanc"
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats txt = map flatten (monoTextsBySentence txt)
  where
    -- Both uses of 'concat' are the "Data.Text" one.
    flatten tokens = concat (map concat (replaceTerms pats tokens))
80
81
-- | Pair each element with the terms extracted from its text.
--
-- The text of an element is obtained with the supplied projection; the work
-- is delegated to 'filterWith'' with 'zip' as the combining function.
filterWith :: TermList
           -> (a -> Text)
           -> [a]
           -> [(a, [Text])]
filterWith termList f = filterWith' termList f zip
87
88
-- | Extract terms from the text of each element and combine the results with
-- the original elements.
--
-- The patterns are built once from the term list. The text of each element
-- is obtained with @f@ and passed to 'extractTermsWithList''; @f'@ then
-- receives the original elements and the extraction results, in the same
-- order.
filterWith' :: TermList
            -> (a -> Text)
            -> ([a] -> [[Text]] -> [b])
            -> [a]
            -> [b]
filterWith' termList f f' xs = f' xs extracted
  where
    pats      = buildPatterns termList
    extracted = map (extractTermsWithList' pats . f) xs
99