]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[REFACT] before scoring new ngrams lists.
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Text.List
17 where
18
19 import Data.Map (Map)
20 import Data.Set (Set)
21 import Data.Text (Text)
22 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
25 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Database.Utils (Cmd)
28 import Gargantext.Text.List.Learn (Model(..))
29 import Gargantext.Prelude
30 --import Gargantext.Text.Terms (TermType(..))
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
36
37
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
39 , stemX :: Int
40 , stopSize :: Int
41 }
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
44
45
46 data StopSize = StopSize {unStopSize :: Int}
47
48 -- | TODO improve grouping functions of Authors, Sources, Institutes..
49 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
50 -> Cmd err (Map NgramsType [NgramsElement])
51 buildNgramsLists l n m s uCid mCid = do
52 ngTerms <- buildNgramsTermsList l n m s uCid mCid
53 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
54 pure $ Map.unions $ othersTerms <> [ngTerms]
55
56
57 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
58 -> Cmd err (Map NgramsType [NgramsElement])
59 buildNgramsOthersList uCid groupIt nt = do
60 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
61
62 let
63 all' = Map.toList ngs
64 pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
65 where
66 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
67 | (t,_ns) <- x
68 ]
69 )
70 ]
71
72 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsTermsList l n m s uCid mCid = do
75 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
76 let termList = toTermList ((isStopTerm s) . fst) candidates
77 let ngs = List.concat $ map toNgramsElement termList
78
79 pure $ Map.fromList [(NgramsTerms, ngs)]
80
81
82 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
83 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
84 case Set.toList setNgrams of
85 [] -> []
86 (parent:children) -> [parentElem] <> childrenElems
87 where
88 parentElem = mkNgramsElement parent
89 listType
90 Nothing
91 (mSetFromList children)
92 childrenElems = map (\t -> mkNgramsElement t listType
93 (Just $ RootParent parent parent)
94 (mSetFromList [])
95 ) children
96
97 -- TODO remove hard coded parameters
98 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
99 toTermList stop ns = map (toTermList' stop CandidateTerm) xs
100 <> map (toTermList' stop GraphTerm) ys
101 <> map (toTermList' stop CandidateTerm) zs
102 where
103 toTermList' stop' l n = case stop' n of
104 True -> (StopTerm, n)
105 False -> (l, n)
106
107 -- TODO use % of size of list
108 -- TODO user ML
109 xs = take a ns
110 ys = take b $ drop a ns
111 zs = drop b $ drop a ns
112
113 a = 300
114 b = 350
115
116 isStopTerm :: StopSize -> Text -> Bool
117 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
118 where
119 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)