]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/WithList.hs
Cleaner Jobs API
[gargantext.git] / src / Gargantext / Core / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE BangPatterns #-}
15
16 module Gargantext.Core.Text.Terms.WithList where
17
18 import Data.List (null)
19 import Data.Ord
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Gargantext.Core.Types (TermsCount)
25 import Gargantext.Core.Utils (groupWithCounts)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29 import qualified Data.List as List
30 ------------------------------------------------------------------------
31
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
34 , _pat_length :: !Int
35 , _pat_terms :: ![Text]
36 }
37 type Patterns = [Pattern]
38
39 ------------------------------------------------------------------------
40
41 data ReplaceTerms = KeepAll | LongestOnly
42
43 replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
44 replaceTerms rplaceTerms pats terms = go 0
45 where
46 terms_len = length terms
47
48 go ix | ix >= terms_len = []
49 | otherwise =
50 case IntMap.lookup ix m of
51 Nothing -> go (ix + 1)
52 Just (len, term) ->
53 term : go (ix + len)
54
55 m = toMap
56 [ (ix, (len, term))
57 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
58
59 toMap = case rplaceTerms of
60 KeepAll -> IntMap.fromList
61 LongestOnly -> IntMap.fromListWith merge
62 where
63 merge (len1, lab1) (len2, lab2) =
64 if len2 < len1 then (len1, lab1) else (len2, lab2)
65
66 buildPatterns :: TermList -> Patterns
67 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
68 where
69 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
70 where
71 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
72 | null alt = error "buildPatterns: ERR2"
73 | otherwise =
74 Pattern (KMP.build alt) (length alt) label
75 --(Terms label $ Set.empty) -- TODO check stems
76
77
78 --------------------------------------------------------------------------
79 -- Utils
80 type MatchedText = Text
81 termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)]
82 termsInText pats txt = groupWithCounts
83 $ List.concat
84 $ map (map unwords)
85 $ extractTermsWithList pats txt
86
87 --------------------------------------------------------------------------
88
89 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
90 extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
91
92 -- | Extract terms
93 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
94 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
95 -- ["chat blanc"]
96 extractTermsWithList' :: Patterns -> Text -> [Text]
97 extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
98 . monoTextsBySentence
99
100 --------------------------------------------------------------------------
101
102 {- | Not used
103 filterWith :: TermList
104 -> (a -> Text)
105 -> [a]
106 -> [(a, [Text])]
107 filterWith termList f xs = filterWith' termList f zip xs
108
109
110 filterWith' :: TermList
111 -> (a -> Text)
112 -> ([a] -> [[Text]] -> [b])
113 -> [a]
114 -> [b]
115 filterWith' termList f f' xs = f' xs
116 $ map (extractTermsWithList' pats)
117 $ map f xs
118 where
119 pats = buildPatterns termList
120 -}