]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[FACTORING] G.Text.Terms.
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Text.List
18 where
19
20 import Data.Either (partitionEithers, Either(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
28 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Admin.Utils (Cmd)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Prelude
32 import Gargantext.Text.List.Learn (Model(..))
33 import Gargantext.Text.Metrics (takeScored)
34 import qualified Data.Char as Char
35 import qualified Data.List as List
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
39
40
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 , stemX :: Int
43 , stopSize :: Int
44 }
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
48 , nlb_group1 :: Int
49 , nlb_group2 :: Int
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
53 }
54
55
56 data StopSize = StopSize {unStopSize :: Int}
57
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
60 -> Cmd err (Map NgramsType [NgramsElement])
61 buildNgramsLists l n m s uCid mCid = do
62 ngTerms <- buildNgramsTermsList l n m s uCid mCid
63 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
64 pure $ Map.unions $ othersTerms <> [ngTerms]
65
66
67 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
68 -> Cmd err (Map NgramsType [NgramsElement])
69 buildNgramsOthersList uCid groupIt nt = do
70 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
71
72 let
73 listSize = 9
74 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
75 graphTerms = List.take listSize all'
76 candiTerms = List.drop listSize all'
77 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
78 , toElements CandidateTerm candiTerms]
79 where
80 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
81 | (t,_ns) <- x
82 ]
83 )
84 ]
85
86 --{-
87 buildNgramsTermsList' :: UserCorpusId
88 -> (Text -> Text)
89 -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
90 -> Cmd err (Map NgramsType [NgramsElement])
91 --}
92 buildNgramsTermsList' uCid groupIt stop gls is = do
93 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
94
95 let
96 (stops, candidates) = partitionEithers
97 $ map (\t -> if stop t then Left t else Right t)
98 $ Map.toList
99 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
100
101 (maps, candidates') = takeScored gls is
102 $ getCoocByNgrams' snd (Diagonal True)
103 $ Map.fromList candidates
104
105
106 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
107
108 (s,c,m) = (stops
109 , List.filter (\(k,_) -> List.elem k candidates') candidates
110 , List.filter (\(k,_) -> List.elem k maps) candidates
111 )
112
113 let ngs' = List.concat
114 $ map toNgramsElement
115 $ map (\t -> (StopTerm, toList' t)) s
116 <> map (\t -> (CandidateTerm, toList' t)) c
117 <> map (\t -> (GraphTerm, toList' t)) m
118
119 pure $ Map.fromList [(NgramsTerms, ngs')]
120
121
122 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
123 -> Cmd err (Map NgramsType [NgramsElement])
124 buildNgramsTermsList l n m s uCid mCid = do
125 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
126 let
127 candidatesSize = 2000
128 a = 10
129 b = 10
130 candidatesHead = List.take candidatesSize candidates
131 candidatesTail = List.drop candidatesSize candidates
132 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
133 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
134 let ngs = List.concat $ map toNgramsElement termList
135
136 pure $ Map.fromList [(NgramsTerms, ngs)]
137
138
139 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
140 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
141 case Set.toList setNgrams of
142 [] -> []
143 (parent:children) -> [parentElem] <> childrenElems
144 where
145 parentElem = mkNgramsElement parent
146 listType
147 Nothing
148 (mSetFromList children)
149 childrenElems = map (\t -> mkNgramsElement t listType
150 (Just $ RootParent parent parent)
151 (mSetFromList [])
152 ) children
153
154
155 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
156 toList stop l n = case stop n of
157 True -> (StopTerm, n)
158 False -> (l, n)
159
160
161 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
162 toTermList _ _ _ [] = []
163 toTermList a b stop ns = -- trace ("computing toTermList") $
164 map (toList stop CandidateTerm) xs
165 <> map (toList stop GraphTerm) ys
166 <> toTermList a b stop zs
167 where
168 xs = take a ns
169 ta = drop a ns
170
171 ys = take b ta
172 zs = drop b ta
173
174
175 isStopTerm :: StopSize -> Text -> Bool
176 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
177 where
178 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)