gapply {SparkR} | R Documentation |
Groups the SparkDataFrame using the specified columns and applies the R function to each group.
gapply
## S4 method for signature 'SparkDataFrame'
gapply(x, cols, func, schema)

gapply(x, ...)

## S4 method for signature 'GroupedData'
gapply(x, func, schema)
x |
A SparkDataFrame |
cols |
Grouping columns |
func |
A function to be applied to each group partition specified by the grouping columns of the SparkDataFrame. The function 'func' takes two arguments: a key (the grouping column values) and a data frame (a local R data.frame). The output of 'func' is a local R data.frame. |
schema |
The schema of the resulting SparkDataFrame after the function is applied. The schema must match the output of 'func'. It has to be defined for each output column, with the preferred output column name and the corresponding data type. |
x |
A GroupedData |
a SparkDataFrame
gapply(SparkDataFrame) since 2.0.0
gapply(GroupedData) since 2.0.0
Other SparkDataFrame functions: $
,
$,SparkDataFrame-method
, $<-
,
$<-,SparkDataFrame-method
,
select
, select
,
select,SparkDataFrame,Column-method
,
select,SparkDataFrame,character-method
,
select,SparkDataFrame,list-method
;
SparkDataFrame-class
; [
,
[,SparkDataFrame-method
, [[
,
[[,SparkDataFrame,numericOrcharacter-method
,
subset
,
subset,SparkDataFrame-method
;
agg
, agg
, agg
,
agg,GroupedData-method
,
agg,SparkDataFrame-method
,
summarize
, summarize
,
summarize
,
summarize,GroupedData-method
,
summarize,SparkDataFrame-method
;
arrange
, arrange
,
arrange
,
arrange,SparkDataFrame,Column-method
,
arrange,SparkDataFrame,character-method
,
orderBy
, orderBy
,
orderBy
, orderBy
,
orderBy,SparkDataFrame,characterOrColumn-method
,
orderBy,WindowSpec,Column-method
,
orderBy,WindowSpec,character-method
;
as.data.frame
,
as.data.frame,SparkDataFrame-method
;
attach
,
attach,SparkDataFrame-method
;
cache
,
cache,SparkDataFrame-method
;
collect
,
collect,SparkDataFrame-method
;
colnames
, colnames
,
colnames,SparkDataFrame-method
,
colnames<-
, colnames<-
,
colnames<-,SparkDataFrame-method
,
columns
, columns
,
columns,SparkDataFrame-method
,
names
,
names,SparkDataFrame-method
,
names<-
,
names<-,SparkDataFrame-method
;
coltypes
, coltypes
,
coltypes,SparkDataFrame-method
,
coltypes<-
, coltypes<-
,
coltypes<-,SparkDataFrame,character-method
;
count
, count
,
count,Column-method
,
count,SparkDataFrame-method
,
n
, n
,
n,Column-method
, nrow
,
nrow,SparkDataFrame-method
;
createOrReplaceTempView
,
createOrReplaceTempView
,
createOrReplaceTempView,SparkDataFrame,character-method
;
dapplyCollect
, dapplyCollect
,
dapplyCollect,SparkDataFrame,function-method
;
dapply
, dapply
,
dapply,SparkDataFrame,function,structType-method
;
describe
, describe
,
describe
,
describe,SparkDataFrame,ANY-method
,
describe,SparkDataFrame,character-method
,
describe,SparkDataFrame-method
,
summary
, summary
,
summary,SparkDataFrame-method
;
dim
,
dim,SparkDataFrame-method
;
distinct
,
distinct,SparkDataFrame-method
,
unique
,
unique,SparkDataFrame-method
;
dropDuplicates
,
dropDuplicates
,
dropDuplicates,SparkDataFrame-method
;
dropna
, dropna
,
dropna,SparkDataFrame-method
,
fillna
, fillna
,
fillna,SparkDataFrame-method
,
na.omit
, na.omit
,
na.omit,SparkDataFrame-method
;
drop
, drop
,
drop
, drop,ANY-method
,
drop,SparkDataFrame-method
;
dtypes
, dtypes
,
dtypes,SparkDataFrame-method
;
except
, except
,
except,SparkDataFrame,SparkDataFrame-method
;
explain
, explain
,
explain,SparkDataFrame-method
;
filter
, filter
,
filter,SparkDataFrame,characterOrColumn-method
,
where
, where
,
where,SparkDataFrame,characterOrColumn-method
;
first
, first
,
first,SparkDataFrame-method
,
first,characterOrColumn-method
;
gapplyCollect
, gapplyCollect
,
gapplyCollect
,
gapplyCollect,GroupedData-method
,
gapplyCollect,SparkDataFrame-method
;
groupBy
, groupBy
,
groupBy,SparkDataFrame-method
,
group_by
, group_by
,
group_by,SparkDataFrame-method
;
head
,
head,SparkDataFrame-method
;
histogram
,
histogram,SparkDataFrame,characterOrColumn-method
;
insertInto
, insertInto
,
insertInto,SparkDataFrame,character-method
;
intersect
, intersect
,
intersect,SparkDataFrame,SparkDataFrame-method
;
isLocal
, isLocal
,
isLocal,SparkDataFrame-method
;
join
,
join,SparkDataFrame,SparkDataFrame-method
;
limit
, limit
,
limit,SparkDataFrame,numeric-method
;
merge
, merge
,
merge,SparkDataFrame,SparkDataFrame-method
;
mutate
, mutate
,
mutate,SparkDataFrame-method
,
transform
, transform
,
transform,SparkDataFrame-method
;
ncol
,
ncol,SparkDataFrame-method
;
persist
,
persist,SparkDataFrame,character-method
;
printSchema
, printSchema
,
printSchema,SparkDataFrame-method
;
randomSplit
, randomSplit
,
randomSplit,SparkDataFrame,numeric-method
;
rbind
, rbind
,
rbind,SparkDataFrame-method
;
registerTempTable
,
registerTempTable
,
registerTempTable,SparkDataFrame,character-method
;
rename
, rename
,
rename,SparkDataFrame-method
,
withColumnRenamed
,
withColumnRenamed
,
withColumnRenamed,SparkDataFrame,character,character-method
;
repartition
,
repartition,SparkDataFrame-method
;
sample
, sample
,
sample,SparkDataFrame,logical,numeric-method
,
sample_frac
, sample_frac
,
sample_frac,SparkDataFrame,logical,numeric-method
;
saveAsParquetFile
,
saveAsParquetFile
,
saveAsParquetFile,SparkDataFrame,character-method
,
write.parquet
, write.parquet
,
write.parquet,SparkDataFrame,character-method
;
saveAsTable
, saveAsTable
,
saveAsTable,SparkDataFrame,character-method
;
saveDF
, saveDF
,
saveDF,SparkDataFrame,character-method
,
write.df
, write.df
,
write.df
,
write.df,SparkDataFrame,character-method
;
schema
, schema
,
schema,SparkDataFrame-method
;
selectExpr
, selectExpr
,
selectExpr,SparkDataFrame,character-method
;
showDF
, showDF
,
showDF,SparkDataFrame-method
;
show
, show
,
show,Column-method
,
show,GroupedData-method
,
show,SparkDataFrame-method
,
show,WindowSpec-method
; str
,
str,SparkDataFrame-method
;
take
,
take,SparkDataFrame,numeric-method
;
union
, union
,
union,SparkDataFrame,SparkDataFrame-method
,
unionAll
, unionAll
,
unionAll,SparkDataFrame,SparkDataFrame-method
;
unpersist
,
unpersist,SparkDataFrame-method
;
withColumn
, withColumn
,
withColumn,SparkDataFrame,character,Column-method
;
with
,
with,SparkDataFrame-method
;
write.jdbc
, write.jdbc
,
write.jdbc,SparkDataFrame,character,character-method
;
write.json
, write.json
,
write.json,SparkDataFrame,character-method
;
write.orc
, write.orc
,
write.orc,SparkDataFrame,character-method
;
write.text
, write.text
,
write.text,SparkDataFrame,character-method
## Not run:
##D Computes the arithmetic mean of the second column by grouping
##D on the first and third columns. Outputs the grouping values and the average.
##D
##D df <- createDataFrame (
##D list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
##D c("a", "b", "c", "d"))
##D
##D Here our output contains three columns: the key, which is a combination of two
##D columns with data types integer and string, and the mean, which is a double.
##D schema <- structType(structField("a", "integer"), structField("c", "string"),
##D structField("avg", "double"))
##D result <- gapply(
##D df,
##D c("a", "c"),
##D function(key, x) {
##D y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
##D }, schema)
##D
##D We can also group the data and afterwards call gapply on GroupedData.
##D For example:
##D gdf <- group_by(df, "a", "c")
##D result <- gapply(
##D gdf,
##D function(key, x) {
##D y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
##D }, schema)
##D collect(result)
##D
##D Result
##D ------
##D a c avg
##D 3 3 3.0
##D 1 1 1.5
##D
##D Fits linear models on the iris dataset by grouping on the 'Species' column,
##D using 'Sepal_Length' as the target variable and 'Sepal_Width', 'Petal_Length'
##D and 'Petal_Width' as training features.
##D
##D df <- createDataFrame (iris)
##D schema <- structType(structField("(Intercept)", "double"),
##D structField("Sepal_Width", "double"),structField("Petal_Length", "double"),
##D structField("Petal_Width", "double"))
##D df1 <- gapply(
##D df,
##D df$"Species",
##D function(key, x) {
##D m <- suppressWarnings(lm(Sepal_Length ~
##D Sepal_Width + Petal_Length + Petal_Width, x))
##D data.frame(t(coef(m)))
##D }, schema)
##D collect(df1)
##D
##D Result
##D ---------
##D Model (Intercept) Sepal_Width Petal_Length Petal_Width
##D 1 0.699883 0.3303370 0.9455356 -0.1697527
##D 2 1.895540 0.3868576 0.9083370 -0.6792238
##D 3 2.351890 0.6548350 0.2375602 0.2521257
##D
## End(Not run)