]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Install] fix debian & openStack adding ubuntu file for tests
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 module Gargantext.Core.Text.List
14 where
15
16 -- import Data.Either (partitionEithers, Either(..))
17 import Data.Map (Map)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
21 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
24 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
25 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
26 import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
27 import Gargantext.Database.Prelude (Cmd)
28 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
29 import Gargantext.Prelude
30 import Gargantext.Core.Text.List.Learn (Model(..))
31 -- import Gargantext.Core.Text.Metrics (takeScored)
32 import qualified Data.Char as Char
33 import qualified Data.List as List
34 import qualified Data.Map as Map
35 import qualified Data.Set as Set
36 import qualified Data.Text as Text
37
38
39 data NgramsListBuilder = BuilderStepO { stemSize :: Int
40 , stemX :: Int
41 , stopSize :: Int
42 }
43 | BuilderStep1 { withModel :: Model }
44 | BuilderStepN { withModel :: Model }
45 | Tficf { nlb_lang :: Lang
46 , nlb_group1 :: Int
47 , nlb_group2 :: Int
48 , nlb_stopSize :: StopSize
49 , nlb_userCorpusId :: UserCorpusId
50 , nlb_masterCorpusId :: MasterCorpusId
51 }
52
53
54 data StopSize = StopSize {unStopSize :: Int}
55
56 -- | TODO improve grouping functions of Authors, Sources, Institutes..
57 buildNgramsLists :: Lang
58 -> Int
59 -> Int
60 -> StopSize
61 -> UserCorpusId
62 -> MasterCorpusId
63 -> Cmd err (Map NgramsType [NgramsElement])
64 buildNgramsLists l n m s uCid mCid = do
65 ngTerms <- buildNgramsTermsList l n m s uCid mCid
66 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
67 pure $ Map.unions $ othersTerms <> [ngTerms]
68
69
70 buildNgramsOthersList :: UserCorpusId
71 -> (Text -> Text)
72 -> NgramsType
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsOthersList uCid groupIt nt = do
75 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
76
77 let
78 listSize = 9
79 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
80
81 graphTerms = List.take listSize all'
82 candiTerms = List.drop listSize all'
83
84 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
85 , toElements CandidateTerm candiTerms
86 ]
87 where
88 toElements nType x =
89 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
90 | (t,_ns) <- x
91 ]
92 )]
93
94 buildNgramsTermsList :: Lang
95 -> Int
96 -> Int
97 -> StopSize
98 -> UserCorpusId
99 -> MasterCorpusId
100 -> Cmd err (Map NgramsType [NgramsElement])
101 buildNgramsTermsList l n m s uCid mCid = do
102 candidates <- sortTficf Up <$> getTficf uCid mCid NgramsTerms
103 -- printDebug "head candidates" (List.take 10 $ candidates)
104 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
105
106 let
107 (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
108 (candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
109
110 termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
111 <> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap)
112 <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal)
113
114 ngs = List.concat
115 $ map toNgramsElement
116 $ groupStems
117 $ map (\(listType, (t,d)) -> ( ngramsGroup l n m t
118 , GroupedText listType t d Set.empty
119 )
120 ) termList
121
122 pure $ Map.fromList [(NgramsTerms, ngs)]
123
124 type Group = Lang -> Int -> Int -> Text -> Text
125 type Stem = Text
126 type Label = Text
127 data GroupedText = GroupedText { _gt_listType :: ListType
128 , _gt_label :: Label
129 , _gt_score :: Double
130 , _gt_group :: Set Text
131 }
132 groupStems :: [(Stem, GroupedText)] -> [GroupedText]
133 groupStems = Map.elems . Map.fromListWith grouping
134 where
135 grouping (GroupedText lt1 label1 score1 group1)
136 (GroupedText lt2 label2 score2 group2)
137 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr)
138 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr)
139 where
140 lt = lt1 <> lt2
141 gr = Set.union group1 group2
142
143 toNgramsElement :: GroupedText -> [NgramsElement]
144 toNgramsElement (GroupedText listType label _ setNgrams) =
145 [parentElem] <> childrenElems
146 where
147 parent = label
148 children = Set.toList setNgrams
149 parentElem = mkNgramsElement parent
150 listType
151 Nothing
152 (mSetFromList children)
153 childrenElems = map (\t -> mkNgramsElement t listType
154 (Just $ RootParent parent parent)
155 (mSetFromList [])
156 ) children
157
158
159 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
160 toGargList isStop l n = case isStop n of
161 True -> (StopTerm, n)
162 False -> (l, n)
163
164
165 isStopTerm :: StopSize -> Text -> Bool
166 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
167 where
168 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)