How do I launch a Cloud Dataflow job from a Google Cloud Function? I'd like to use Google Cloud Functions as a mechanism to enable cross-service composition.
I've included a very basic WordCount example below. Note that you'll need to include a copy of the java binary in your Cloud Function deployment, since it is not in the default environment. Likewise, you'll also need to package your deploy jar with the Cloud Function.
module.exports = {
  wordcount: function (context, data) {
    const spawn = require('child_process').spawn;
    // Shell out to the JRE and jar packaged with the function;
    // neither is available in the default Cloud Functions environment.
    const child = spawn(
      'jre1.8.0_73/bin/java',
      ['-cp',
       'MY_JAR.jar',
       'com.google.cloud.dataflow.examples.WordCount',
       '--jobName=fromACloudFunction',
       '--project=MY_PROJECT',
       '--runner=BlockingDataflowPipelineRunner',
       '--stagingLocation=gs://STAGING_LOCATION',
       '--inputFile=gs://dataflow-samples/shakespeare/*',
       '--output=gs://OUTPUT_LOCATION'
      ],
      { cwd: __dirname });
    child.stdout.on('data', function(data) {
      console.log('stdout: ' + data);
    });
    child.stderr.on('data', function(data) {
      console.log('error: ' + data);
    });
    child.on('close', function(code) {
      console.log('closing code: ' + code);
      // Signal completion only once the child exits; calling
      // context.success() earlier can terminate the function before
      // the blocking runner has finished.
      context.success();
    });
  }
};
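For reference, the deployment package for this function would be laid out roughly as follows (the JRE directory and jar names match the spawn call above):

index.js          (the function above)
package.json
jre1.8.0_73/      (bundled JRE; java is not in the default environment)
MY_JAR.jar        (the pipeline's bundled deploy jar)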
You could further enhance this example by using the non-blocking runner and having the function return the job ID, so that you can poll for job completion separately. This pattern should be valid for other SDKs as well, so long as their dependencies can be packaged into the Cloud Function.
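For the polling half of that pattern, a minimal sketch using the googleapis Node.js client might look like the following. The function name waitForJob, the 30-second interval, and the auth setup are illustrative assumptions; projectId and jobId stand in for whatever your launcher returned.

const { google } = require('googleapis');

// Poll a Dataflow job until it reaches a terminal state (sketch).
async function waitForJob(projectId, jobId) {
  const auth = new google.auth.GoogleAuth({
    scopes: ['https://www.googleapis.com/auth/cloud-platform']
  });
  const dataflow = google.dataflow({ version: 'v1b3', auth });
  for (;;) {
    const res = await dataflow.projects.jobs.get({ projectId, jobId });
    const state = res.data.currentState; // e.g. JOB_STATE_RUNNING
    console.log(`Job ${jobId} state: ${state}`);
    if (['JOB_STATE_DONE', 'JOB_STATE_FAILED', 'JOB_STATE_CANCELLED'].includes(state)) {
      return state;
    }
    await new Promise(resolve => setTimeout(resolve, 30000)); // wait 30s between polls
  }
}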
The best approach is to launch it via a Cloud Function, but be careful: if your Cloud Function is triggered by Google Cloud Storage, a Dataflow job will be launched for every uploaded file (a guard against that is sketched after the code below).
const { google } = require('googleapis');

const templatePath = "gs://template_dir/df_template";
const project = "<project_id>";
const tempLoc = "gs://tempLocation/";
const authScope = ['https://www.googleapis.com/auth/cloud-platform'];
// `config` and `slack` were the author's own helpers; minimal stand-ins
// are assumed here so the sample is self-contained.
const config = { jobNamePrefix: "pmkafka" };
const slack = { publishMessage: function () { /* notify on failure */ } };

exports.PMKafka = (data, context, callback) => {
    const file = data;

    console.log(`Event ${context.eventId}`);
    console.log(`Event Type: ${context.eventType}`);
    console.log(`Bucket Name: ${file.bucket}`);
    console.log(`File Name: ${file.name}`);
    console.log(`Metageneration: ${file.metageneration}`);
    console.log(`Created: ${file.timeCreated}`);
    console.log(`Updated: ${file.updated}`);
    console.log(`Uploaded File Name - gs://${file.bucket}/${file.name}`);

    google.auth.getApplicationDefault(function (err, authClient, projectId) {
        if (err) {
            throw err;
        }
        if (authClient.createScopedRequired && authClient.createScopedRequired()) {
            authClient = authClient.createScoped(authScope);
        }
        const dataflow = google.dataflow({ version: 'v1b3', auth: authClient });

        // Template runtime parameters; the uploaded file becomes the input.
        var inputDict = {
            inputFile: `gs://${file.bucket}/${file.name}`
            // ...
            // <other_runtime_parameters>
        };
        var env = {
            tempLocation: tempLoc
        };
        // Dataflow job names only allow lowercase letters, digits and
        // dashes, so strip every colon and dot from the timestamp
        // (the replace must be global, not first-occurrence only).
        var resource_opts = {
            parameters: inputDict,
            environment: env,
            jobName: config.jobNamePrefix + "-" + new Date().toISOString().toLowerCase().replace(/[:.]/g, "-")
        };
        var opts = {
            gcsPath: templatePath,
            projectId: project,
            resource: resource_opts
        };
        console.log(`Dataflow Run Time Options - ${JSON.stringify(opts)}`);

        dataflow.projects.templates.launch(opts, function (err, response) {
            if (err) {
                console.error("problem running dataflow template, error was: ", err);
                slack.publishMessage(null, null, false, err);
                callback(err);
                return;
            }
            console.log("Dataflow template response: ", response);
            var jobid = response["data"]["job"]["id"];
            console.log("Dataflow Job ID: ", jobid);
            // Complete the function only after the launch call has returned.
            callback();
        });
    });
};
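To avoid launching one job per object-finalize event, return early for uploads you don't care about. A minimal guard sketch follows; the input/ prefix and .csv suffix are illustrative assumptions, not part of the original:

exports.PMKafka = (data, context, callback) => {
    const file = data;
    // Only uploads matching the expected prefix/suffix start a pipeline.
    if (!file.name.startsWith('input/') || !file.name.endsWith('.csv')) {
        console.log(`Ignoring gs://${file.bucket}/${file.name}`);
        callback();
        return;
    }
    // ...launch the template as above...
};

The function itself is wired to the bucket with a storage trigger, along the lines of:

gcloud functions deploy PMKafka --runtime nodejs20 --trigger-resource <bucket_name> --trigger-event google.storage.object.finalize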