]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/WithList.hs
[ngrams] first version of ngrams counting in docs
[gargantext.git] / src / Gargantext / Core / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE BangPatterns #-}
15
16 module Gargantext.Core.Text.Terms.WithList where
17
18 import Data.List (null)
19 import Data.Ord
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Gargantext.Core.Types (TermsCount)
25 import Gargantext.Core.Utils (groupWithCounts)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29 import qualified Data.List as List
30 ------------------------------------------------------------------------
31
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
34 , _pat_length :: !Int
35 , _pat_terms :: ![Text]
36 }
37 type Patterns = [Pattern]
38
39 ------------------------------------------------------------------------
40 replaceTerms :: Patterns -> [Text] -> [[Text]]
41 replaceTerms pats terms = go 0
42 where
43 terms_len = length terms
44
45 go ix | ix >= terms_len = []
46 | otherwise =
47 case IntMap.lookup ix m of
48 Nothing -> go (ix + 1)
49 Just (len, term) ->
50 term : go (ix + len)
51
52
53 merge (len1, lab1) (len2, lab2) =
54 if len2 < len1 then (len1, lab1) else (len2, lab2)
55
56 m =
57 IntMap.fromListWith merge
58 [ (ix, (len, term))
59 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
60
61 buildPatterns :: TermList -> Patterns
62 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
63 where
64 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
65 where
66 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
67 | null alt = error "buildPatterns: ERR2"
68 | otherwise =
69 Pattern (KMP.build alt) (length alt) label
70 --(Terms label $ Set.empty) -- TODO check stems
71
72
73 --------------------------------------------------------------------------
74 -- Utils
75 type BlockText = Text
76 type MatchedText = Text
77 termsInText :: Patterns -> BlockText -> [(MatchedText, TermsCount)]
78 termsInText pats txt = groupWithCounts
79 $ List.concat
80 $ map (map unwords)
81 $ extractTermsWithList pats txt
82
83 --------------------------------------------------------------------------
84
85 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
86 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
87
88 -- | Extract terms
89 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
90 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
91 -- ["chat blanc"]
92 extractTermsWithList' :: Patterns -> Text -> [Text]
93 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
94 . monoTextsBySentence
95
96 --------------------------------------------------------------------------
97
98 {- | Not used
99 filterWith :: TermList
100 -> (a -> Text)
101 -> [a]
102 -> [(a, [Text])]
103 filterWith termList f xs = filterWith' termList f zip xs
104
105
106 filterWith' :: TermList
107 -> (a -> Text)
108 -> ([a] -> [[Text]] -> [b])
109 -> [a]
110 -> [b]
111 filterWith' termList f f' xs = f' xs
112 $ map (extractTermsWithList' pats)
113 $ map f xs
114 where
115 pats = buildPatterns termList
116 -}