]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Merge] dev -> dev-phylo ready
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text.List.Learn (Model(..))
31 -- import Gargantext.Core.Text.Metrics (takeScored)
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Set as Set
36 import qualified Data.Text as Text
37
38
39 data NgramsListBuilder = BuilderStepO { stemSize :: Int
40 , stemX :: Int
41 , stopSize :: Int
42 }
43 | BuilderStep1 { withModel :: Model }
44 | BuilderStepN { withModel :: Model }
45 | Tficf { nlb_lang :: Lang
46 , nlb_group1 :: Int
47 , nlb_group2 :: Int
48 , nlb_stopSize :: StopSize
49 , nlb_userCorpusId :: UserCorpusId
50 , nlb_masterCorpusId :: MasterCorpusId
51 }
52
53
54 data StopSize = StopSize {unStopSize :: Int}
55
56 -- | TODO improve grouping functions of Authors, Sources, Institutes..
57 buildNgramsLists :: Lang
58 -> Int
59 -> Int
60 -> StopSize
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> Cmd err (Map NgramsType [NgramsElement])
64 buildNgramsLists l n m s uCid mCid = do
65 ngTerms <- buildNgramsTermsList l n m s uCid mCid
66 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
67 pure $ Map.unions $ othersTerms <> [ngTerms]
68
69
70 buildNgramsOthersList :: UserCorpusId
71 -> (Text -> Text)
72 -> NgramsType
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsOthersList uCid groupIt nt = do
75 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
76
77 let
78 listSize = 9
79 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
80
81 graphTerms = List.take listSize all'
82 candiTerms = List.drop listSize all'
83
84 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
85 , toElements CandidateTerm candiTerms
86 ]
87 where
88 toElements nType x =
89 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
90 | (t,_ns) <- x
91 ]
92 )]
93
94 buildNgramsTermsList :: Lang
95 -> Int
96 -> Int
97 -> StopSize
98 -> UserCorpusId
99 -> MasterCorpusId
100 -> Cmd err (Map NgramsType [NgramsElement])
101 buildNgramsTermsList _l _n _m s uCid mCid = do
102 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
103 printDebug "head candidates" (List.take 10 $ candidates)
104 printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
105
106 let
107 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
108 (candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
109
110 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
111 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap)
112 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal)
113
114 ngs = List.concat
115 $ map toNgramsElement
116 $ map (\(lt, (t,d)) -> (lt, ((t, (d,Set.singleton t))))) termList
117
118 pure $ Map.fromList [(NgramsTerms, ngs)]
119
120
121 toTermList :: Int
122 -> Int
123 -> (a -> Bool)
124 -> [a]
125 -> [(ListType, a)]
126 toTermList _ _ _ [] = []
127 toTermList a b stop ns = -- trace ("computing toTermList") $
128 map (toGargList stop CandidateTerm) xs
129 <> map (toGargList stop MapTerm) ys
130 <> toTermList a b stop zs
131 where
132 xs = take a ns
133 xz = drop a ns
134
135 ys = take b xz
136 zs = drop b xz
137
138
139 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
140 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
141 case Set.toList setNgrams of
142 [] -> []
143 (parent:children) -> [parentElem] <> childrenElems
144 where
145 parentElem = mkNgramsElement parent
146 listType
147 Nothing
148 (mSetFromList children)
149 childrenElems = map (\t -> mkNgramsElement t listType
150 (Just $ RootParent parent parent)
151 (mSetFromList [])
152 ) children
153
154
155 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
156 toGargList isStop l n = case isStop n of
157 True -> (StopTerm, n)
158 False -> (l, n)
159
160
161 isStopTerm :: StopSize -> Text -> Bool
162 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
163 where
164 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)