@@ -39,7 +39,50 @@ export default class Groupby {
3939 this . colIndex = colIndex
4040
4141 }
42-
42+ /**
43+ * Generate group object data needed for group operations
44+ * let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ] ];
45+ * let cols = [ "A", "B", "C" ];
46+ * let df = new dfd.DataFrame(data, { columns: cols });
47+ * let groupDf = df.groupby([ "A" ]);
48+ * The following internal object is generated and saved to this.colDict:
49+ * {
50+ * '1': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
51+ * '4': { A: [ 4 ], B: [ 5 ], C: [ 6 ] },
52+ * '20': { A: [ 20 ], B: [ 30 ], C: [ 40 ] },
53+ * '39': { A: [ 39 ], B: [ 89 ], C: [ 78 ] }
54+ * }
55+ * When grouping by more than one column, the group keys are joined with '-',
56+ * e.g. for df.groupby(['A','B'])
57+ * the result will look like this:
58+ * {
59+ * '1-2': {A: [ 1 ], B: [ 2 ], C: [ 3 ]},
60+ * '4-5': {A: [ 4 ], B: [ 5 ], C: [ 6 ]}
61+ * }
62+ * However, when analysing a specific column like this:
63+ * df.groupby(['A','B']).col(['C'])
64+ * the internal result becomes:
65+ * {
66+ * '1-2': { C: [ 3 ]},
67+ * '4-5': {C: [ 6 ]}
68+ * }
69+ * When building the multi-index DataFrame from this data,
70+ * we lose track of the values of columns A and B.
71+ * They could be recovered by calling split('-') on the object keys,
72+ * e.g. '1-2'.split('-') gives us the values for A and B.
73+ * But there are edge cases where the values of A and B themselves contain '-',
74+ * e.g.
75+ * {
76+ * '1--2-': { C: [ 3 ]},
77+ * '4--5-': {C: [ 6 ]}
78+ * }
79+ * where using `.split('-')` might not work well.
80+ * Hence we create a key-value `keyToValue` object to store each index and its
81+ * associated values.
82+ * NOTE: The previous implementation used a graph representation
83+ * of the groupby data together with depth-first search (DFS). We decided to use a plain
84+ * JavaScript object as a hash map to reduce search time compared to using a graph and DFS.
85+ */
4386 group ( ) : Groupby {
4487 const self = this
4588 let keyToValue :{
@@ -78,6 +121,24 @@ export default class Groupby {
78121 return this
79122 }
80123
124+ /**
125+ * Generate new internal groupby data, e.g.
126+ * group = df.groupby(['A', 'B']).col('C')
127+ * This filters the colDict property generated by `.group()`:
128+ * each group is filtered to contain only column `C` in its internal object,
129+ * e.g. from
130+ * {
131+ * '1-2': {A: [ 1 ], B: [ 2 ], C: [ 3 ]},
132+ * '4-5': {A: [ 4 ], B: [ 5 ], C: [ 6 ]}
133+ * }
134+ * to
135+ * {
136+ * '1-2': { C: [ 3 ]},
137+ * '4-5': {C: [ 6 ]}
138+ * }
139+ * @param colNames column names
140+ * @return Groupby
141+ */
81142 col ( colNames : ArrayType1D | undefined ) : Groupby {
82143
83144 if ( typeof colNames === "undefined" ) {
@@ -114,7 +175,47 @@ export default class Groupby {
114175 return gp
115176 }
116177
117- arithemetic ( operation : { [ key : string ] : Array < string > | string } | string ) : { [ key : string ] : { } } {
178+ /**
179+ * Perform all groupby arithmetic operations.
180+ * In the previous implementation each group's data was
181+ * stored as a DataFrame, which involved a lot of memory usage.
182+ * Hence each group is now just a plain JavaScript object
183+ * and all arithmetic operations are done directly on JavaScript
184+ * arrays.
185+ * e.g.
186+ * using this internal data:
187+ * {
188+ * '1-2': {A: [ 1,3 ], B: [ 2,5 ], C: [ 3, 5 ]},
189+ * '4-5': {A: [ 4,1 ], B: [ 5,0 ], C: [ 6, 12 ]}
190+ * }
191+ * 1) using groupby(['A', 'B']).arithmetic("mean")
192+ * result: {
193+ * '1-2': {A_mean: [ 2 ], B_mean: [ 3.5 ], C_mean: [ 4 ]},
194+ * '4-5': {A_mean: [ 2.5 ], B_mean: [ 2.5 ], C_mean: [ 9 ]}
195+ * }
196+ * 2) .arithmetic({
197+ * A: 'mean',
198+ * B: 'sum',
199+ * C: 'min'
200+ * })
201+ * result:
202+ * {
203+ * '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ]},
204+ * '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ]}
205+ * }
206+ * 3) .arithmetic({
207+ * A: 'mean',
208+ * B: 'sum',
209+ * C: ['min', 'max']
210+ * })
211+ * result:
212+ * {
213+ * '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ], C_max: [5]},
214+ * '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ], C_max: [12]}
215+ * }
216+ * @param operation
217+ */
218+ private arithemetic ( operation : { [ key : string ] : Array < string > | string } | string ) : { [ key : string ] : { } } {
118219
119220 const opsName = [ "mean" , "sum" , "count" , "mode" , "std" , "var" , "cumsum" , "cumprod" ,
120221 "cummax" , "cummin" , "median" , "min" ] ;
@@ -173,7 +274,12 @@ export default class Groupby {
173274 return colDict
174275 }
175276
176- groupMathLog ( colVal : Array < number > , ops : string ) : Array < number > {
277+ /**
278+ * Perform all arithmetic logic
279+ * @param colVal
280+ * @param ops
281+ */
282+ private groupMathLog ( colVal : Array < number > , ops : string ) : Array < number > {
177283 let data = [ ]
178284 switch ( ops ) {
179285 case "max" :
@@ -261,7 +367,12 @@ export default class Groupby {
261367 return data
262368 }
263369
264- toDataFrame ( colDict : { [ key : string ] : { } } ) : DataFrame {
370+ /**
371+ * Takes the internal groupby data and converts
372+ * it to a single DataFrame.
373+ * @param colDict
374+ */
375+ private toDataFrame ( colDict : { [ key : string ] : { } } ) : DataFrame {
265376 let data : { [ key : string ] : ArrayType1D } = { }
266377
267378 for ( let key of this . colKeyDict ( colDict ) ) {
@@ -287,7 +398,7 @@ export default class Groupby {
287398 return new DataFrame ( data )
288399 }
289400
290- operations ( ops : string ) : DataFrame {
401+ private operations ( ops : string ) : DataFrame {
291402 if ( ! this . groupColNames ) {
292403 let colGroup = this . col ( undefined )
293404 let colDict = colGroup . arithemetic ( ops )
0 commit comments