beam/sdks/python/apache_beam/examples/inference/huggingface_question_answering.py at master · apache/beam · GitHub

Name: beam/sdks/python/apache_beam/examples/inference/huggingface_question_answering.py at master · apache/beam · GitHub
Rating: 4.6 (2682 reviews)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

""""A pipeline that uses RunInference to perform Question Answering using the
model from Hugging Face Models Hub.

This pipeline takes questions and context from a custom text file separated by
a semicolon. These are converted to SquadExamples by using the utility provided
by transformers.QuestionAnsweringPipeline and passed to the model handler.
We just provide the model name here because the model repository specifies the
task that it will do. The pipeline then writes the prediction to an output
file in which users can then compare against the original context.
"""

importargparse
importlogging
fromcollections.abcimportIterable

importapache_beamasbeam
fromapache_beam.ml.inference.baseimportKeyedModelHandler
fromapache_beam.ml.inference.baseimportPredictionResult
fromapache_beam.ml.inference.baseimportRunInference
fromapache_beam.ml.inference.huggingface_inferenceimportHuggingFacePipelineModelHandler
fromapache_beam.ml.inference.huggingface_inferenceimportPipelineTask
fromapache_beam.options.pipeline_optionsimportPipelineOptions
fromapache_beam.options.pipeline_optionsimportSetupOptions
fromapache_beam.runners.runnerimportPipelineResult
fromtransformersimportQuestionAnsweringPipeline


class PostProcessor(beam.DoFn):
  """Extracts the predicted answer from a keyed PredictionResult.

  The Hugging Face Pipeline for Question Answering returns a dictionary
  holding the score, the start and end index of the answer, and the answer
  itself. Only the answer is kept here.
  """
  def process(self, result: tuple[str, PredictionResult]) -> Iterable[str]:
    """Yields the key and the predicted answer joined by a semicolon.

    Args:
      result: a (key, PredictionResult) pair; the 'answer' entry of the
        prediction's inference is read.

    Yields:
      A single string of the form '<key>;<answer>'.
    """
    key, prediction_result = result
    answer = prediction_result.inference['answer']
    yield ';'.join((key, answer))


def preprocess(text):
  """Separates a line of text into its question and context.

  The line is split on the first semicolon only, so a context that itself
  contains semicolons is kept intact instead of failing to unpack. Lines
  that are empty or contain only whitespace produce no output.

  Args:
    text (str): string with question and context separated by semi-colon.

  Yields:
    (str, str): the question and context taken from text.

  Raises:
    ValueError: if a non-blank line contains no semicolon separator.
  """
  if not text.strip():
    return
  # partition splits at the first ';' only; anything after it, including
  # further semicolons, belongs to the context.
  question, separator, context = text.partition(';')
  if not separator:
    raise ValueError(
        'Expected question and context separated by a semicolon, got: %r' %
        text)
  yield (question, context)


def create_squad_example(text):
  """Builds the SquadExample input expected by QuestionAnsweringPipeline.

  The sample is created with QuestionAnsweringPipeline.create_sample and is
  paired with the question, which serves as the key of the yielded tuple.

  Check out https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.QuestionAnsweringPipeline.__call__.X #pylint: disable=line-too-long
  to learn about valid input types for QuestionAnswering Pipeline.

  Args:
    text (Tuple[str,str]): a tuple of question and context.

  Yields:
    A (question, sample) tuple, where sample is the object returned by
    QuestionAnsweringPipeline.create_sample(question, context).
  """
  question, context = text
  sample = QuestionAnsweringPipeline.create_sample(question, context)
  yield question, sample


def parse_known_args(argv):
  """Parses the command line flags understood by this example.

  Args:
    argv: list of command line arguments to parse.

  Returns:
    The (namespace, remaining_args) pair produced by
    argparse.ArgumentParser.parse_known_args: the parsed example flags and
    the list of arguments that were not recognized.
  """
  # Each entry is (flag, keyword options). The destination attribute is
  # derived by argparse from the flag name (e.g. '--model_name' is stored
  # as 'model_name').
  flag_specs = (
      (
          '--input',
          {
              'help': 'Path of file containing question and context '
              'separated by semicolon',
          }),
      (
          '--output',
          {
              'required': True,
              'help': 'Path of file in which to save the output predictions.',
          }),
      (
          '--model_name',
          {
              'default': "deepset/roberta-base-squad2",
              'help': 'Model repository-id from Hugging Face Models Hub.',
          }),
      (
          '--revision',
          {
              'help': 'Specific model version to use - branch name, '
              'tag name, or a commit-id.',
          }),
  )
  arg_parser = argparse.ArgumentParser()
  for flag, options in flag_specs:
    arg_parser.add_argument(flag, **options)
  return arg_parser.parse_known_args(argv)


def run(
    argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult:
  """Builds and runs the question answering inference pipeline.

  Args:
    argv: Command line arguments defined for this example.
    save_main_session: Used for internal testing. Assigned to the
      save_main_session setup option of the pipeline.
    test_pipeline: Used for internal testing. When given, it is used in
      place of a newly created pipeline.

  Returns:
    The PipelineResult of the run, returned after waiting for it to finish.
  """
  known_args, pipeline_args = parse_known_args(argv)
  options = PipelineOptions(pipeline_args)
  options.view_as(SetupOptions).save_main_session = save_main_session

  # Only build a fresh pipeline when the caller did not supply one.
  pipeline = test_pipeline or beam.Pipeline(options=options)

  load_model_args = {'framework': 'pt', 'revision': known_args.revision}
  model_handler = HuggingFacePipelineModelHandler(
      task=PipelineTask.QuestionAnswering,
      model=known_args.model_name,
      load_model_args=load_model_args)

  if known_args.input:
    text = (
        pipeline | 'ReadSentences' >> beam.io.ReadFromText(known_args.input))
  else:
    # No input file given: fall back to a few built-in
    # "question;context" examples.
    text = (
        pipeline | 'CreateSentences' >> beam.Create([
            "What does Apache Beam do?;"
            "Apache Beam enables batch and streaming data processing.",
            "What is the capital of France?;The capital of France is Paris .",
            "Where was beam summit?;Apache Beam Summit 2023 was in NYC.",
        ]))

  # The model handler is wrapped in a KeyedModelHandler because each element
  # reaching RunInference is a (question, sample) tuple.
  predictions = (
      text
      | 'PreProcess' >> beam.ParDo(preprocess)
      | 'SquadExample' >> beam.ParDo(create_squad_example)
      | 'RunInference' >> RunInference(KeyedModelHandler(model_handler))
      | 'ProcessOutput' >> beam.ParDo(PostProcessor()))
  _ = predictions | "WriteOutput" >> beam.io.WriteToText(
      known_args.output, shard_name_template='', append_trailing_newlines=True)

  pipeline_result = pipeline.run()
  pipeline_result.wait_until_finish()
  return pipeline_result


if __name__ == '__main__':
  # Set the root logger to INFO level before running the example.
  root_logger = logging.getLogger()
  root_logger.setLevel(logging.INFO)
  run()