]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/WithList.hs
[FIX] Arxiv api is working for first batch, not the others. I have integrated the...
[gargantext.git] / src / Gargantext / Core / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE BangPatterns #-}
15
16 module Gargantext.Core.Text.Terms.WithList where
17
18 import Data.List (null)
19 import Data.Ord
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Prelude (error)
25 import qualified Data.Algorithms.KMP as KMP
26 import qualified Data.IntMap.Strict as IntMap
27 import qualified Data.List as List
28 ------------------------------------------------------------------------
29
30 data Pattern = Pattern
31 { _pat_table :: !(KMP.Table Text)
32 , _pat_length :: !Int
33 , _pat_terms :: ![Text]
34 }
35 type Patterns = [Pattern]
36
37 ------------------------------------------------------------------------
38 replaceTerms :: Patterns -> [Text] -> [[Text]]
39 replaceTerms pats terms = go 0
40 where
41 terms_len = length terms
42
43 go ix | ix >= terms_len = []
44 | otherwise =
45 case IntMap.lookup ix m of
46 Nothing -> go (ix + 1)
47 Just (len, term) ->
48 term : go (ix + len)
49
50
51 merge (len1, lab1) (len2, lab2) =
52 if len2 < len1 then (len1, lab1) else (len2, lab2)
53
54 m =
55 IntMap.fromListWith merge
56 [ (ix, (len, term))
57 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
58
59 buildPatterns :: TermList -> Patterns
60 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
61 where
62 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
63 where
64 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
65 | null alt = error "buildPatterns: ERR2"
66 | otherwise =
67 Pattern (KMP.build alt) (length alt) label
68 --(Terms label $ Set.empty) -- TODO check stems
69
70
71 --------------------------------------------------------------------------
72 -- Utils
73 type BlockText = Text
74 type MatchedText = Text
75 termsInText :: Patterns -> BlockText -> [MatchedText]
76 termsInText pats txt = List.nub
77 $ List.concat
78 $ map (map unwords)
79 $ extractTermsWithList pats txt
80
81 --------------------------------------------------------------------------
82
83 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
84 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
85
86 -- | Extract terms
87 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
88 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
89 -- ["chat blanc"]
90 extractTermsWithList' :: Patterns -> Text -> [Text]
91 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
92 . monoTextsBySentence
93
94 --------------------------------------------------------------------------
95
96 {- | Not used
97 filterWith :: TermList
98 -> (a -> Text)
99 -> [a]
100 -> [(a, [Text])]
101 filterWith termList f xs = filterWith' termList f zip xs
102
103
104 filterWith' :: TermList
105 -> (a -> Text)
106 -> ([a] -> [[Text]] -> [b])
107 -> [a]
108 -> [b]
109 filterWith' termList f f' xs = f' xs
110 $ map (extractTermsWithList' pats)
111 $ map f xs
112 where
113 pats = buildPatterns termList
114 -}