Spaces:

cffl
/

Exploring_Intelligent_Writing_Assistance

Runtime error

App Files Files Community

Exploring_Intelligent_Writing_Assistance / apps /data_utils.py

andrewrreed

add all application files

2e4274a over 3 years ago

raw

history blame contribute delete

5.71 kB

	# ###########################################################################
	#
	# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
	# (C) Cloudera, Inc. 2022
	# All rights reserved.
	#
	# Applicable Open Source License: Apache 2.0
	#
	# NOTE: Cloudera open source products are modular software products
	# made up of hundreds of individual components, each of which was
	# individually copyrighted. Each Cloudera open source product is a
	# collective work under U.S. Copyright Law. Your license to use the
	# collective work is as provided in your written agreement with
	# Cloudera. Used apart from the collective work, this file is
	# licensed for your use pursuant to the open source license
	# identified above.
	#
	# This code is provided to you pursuant a written agreement with
	# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
	# this code. If you do not have a written agreement with Cloudera nor
	# with an authorized and properly licensed third party, you do not
	# have any rights to access nor to use this code.
	#
	# Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
	# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
	# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
	# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
	# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
	# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
	# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
	# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
	# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
	# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
	# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
	# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
	# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
	# DATA.
	#
	# ###########################################################################

	import os
	from typing import List
	from collections import defaultdict
	from dataclasses import dataclass

	import numpy as np


	@dataclass
	class StyleAttributeData:
	    """
	    Settings for a single style attribute pair (source -> target).

	    Holds the attribute names, example texts and model paths for the pair.
	    Three convenience strings are derived from the attribute names right
	    after initialisation (see `__post_init__`).

	    Attributes:
	        source_attribute (str): name of the source style attribute
	        target_attribute (str): name of the target style attribute
	        examples (List[str]): example texts for this attribute pair
	        cls_model_path (str): model path looked up for model type "cls"
	        seq2seq_model_path (str): model path looked up for model type "seq2seq"
	        sbert_model_path (str): model path looked up for model type "sbert"
	        hf_base_url (str): base url that model paths are appended to
	        attribute_selecting_string (str): "<source>-<target>"
	        attribute_AND_string (str): "<source> and <target>"
	        attribute_THAN_string (str): "<source> than <target>"
	    """

	    source_attribute: str
	    target_attribute: str
	    examples: List[str]
	    cls_model_path: str
	    seq2seq_model_path: str
	    sbert_model_path: str = "sentence-transformers/all-MiniLM-L6-v2"
	    hf_base_url: str = "https://huggingface.co/"

	    def __post_init__(self):
	        """Populate the three derived attribute strings."""
	        self._make_attribute_selection_string()
	        self._make_attribute_AND_string()
	        self._make_attribute_THAN_string()

	    def _make_attribute_selection_string(self):
	        """Set `attribute_selecting_string` to "<source>-<target>"."""
	        self.attribute_selecting_string = (
	            f"{self.source_attribute}-{self.target_attribute}"
	        )

	    def _make_attribute_AND_string(self):
	        """Set `attribute_AND_string` to "<source> and <target>"."""
	        self.attribute_AND_string = (
	            f"{self.source_attribute} and {self.target_attribute}"
	        )

	    def _make_attribute_THAN_string(self):
	        """Set `attribute_THAN_string` to "<source> than <target>"."""
	        self.attribute_THAN_string = (
	            f"{self.source_attribute} than {self.target_attribute}"
	        )

	    def build_model_url(self, model_type: str) -> str:
	        """
	        Build a complete HuggingFace url for the given `model_type`.

	        The url is assembled with explicit "/" joining instead of
	        `os.path.join`: that helper targets filesystem paths, so it uses
	        the platform separator and drops the base entirely when the model
	        path starts with "/".

	        Args:
	            model_type (str): "cls", "seq2seq", "sbert"

	        Returns:
	            str: `hf_base_url` and the `<model_type>_model_path` value
	                joined by exactly one "/".

	        Raises:
	            AttributeError: if the instance has no
	                `<model_type>_model_path` attribute.
	        """
	        model_path = getattr(self, f"{model_type}_model_path")
	        # Strip the slashes at the seam so exactly one separates base and path.
	        return f"{self.hf_base_url.rstrip('/')}/{model_path.lstrip('/')}"


	# Registry of all style attribute configurations, keyed by
	# "<source>-to-<target>". Entries are added one by one below.
	DATA_PACKET = {}

	DATA_PACKET["subjective-to-neutral"] = StyleAttributeData(
	    source_attribute="subjective",
	    target_attribute="neutral",
	    cls_model_path="cffl/bert-base-styleclassification-subjective-neutral",
	    seq2seq_model_path="cffl/bart-base-styletransfer-subjective-to-neutral",
	    examples=[
	        "another strikingly elegant four-door design for the bentley s3 continental came from james.",
	        "the band plays an engaging and contagious rhythm known as brega pop and calypso.",
	        "chemical abstracts service (cas), a prominent division of the american chemical society, is the world's leading source of chemical information.",
	        "the final fight scene is with the martial arts great, master ninja sho kosugi.",
	    ],
	)

	DATA_PACKET["informal-to-formal"] = StyleAttributeData(
	    source_attribute="informal",
	    target_attribute="formal",
	    cls_model_path="cointegrated/roberta-base-formality",
	    seq2seq_model_path="prithivida/informal_to_formal_styletransfer",
	    examples=[
	        "that was funny LOL",
	        "btw - ur avatar looks familiar",
	        "i loooooooooooooooooooooooove going to the movies.",
	        "haha, thatd be dope",
	    ],
	)


	def format_classification_results(id2label: dict, cls_result):
	    """
	    Formats classification output to be plotted using Altair.

	    Produces one record per label, in the iteration order of `id2label`,
	    pairing each label with the score at the same position of
	    `cls_result[0]["distribution"]`.

	    Args:
	        id2label (dict): Transformer model's label dictionary
	        cls_result (List): Classification pipeline output; only
	            `cls_result[0]["distribution"]` is read.

	    Returns:
	        list: one plain dict per label with the keys "type" (capitalized
	            label), "value" (score rounded to 4 decimals),
	            "percentage_start" and "percentage_end".

	    Raises:
	        IndexError: if the distribution has fewer entries than `id2label`.
	    """
	    formatted = []

	    for position, label in enumerate(id2label.values()):
	        score = round(cls_result[0]["distribution"][position], 4)

	        # The first label is anchored at the left edge, [0, score]; every
	        # other label is anchored at the right edge, [1 - score, 1].
	        # NOTE: these ranges only tile [0, 1] without overlap for two labels.
	        if position == 0:
	            start, end = 0, score
	        else:
	            start, end = 1 - score, 1

	        formatted.append(
	            {
	                "type": label.capitalize(),
	                "value": score,
	                "percentage_start": start,
	                "percentage_end": end,
	            }
	        )

	    return formatted


	def string_to_list_string(text: str):
	    """
	    Wrap a single string in a one-element list.

	    Built directly rather than through a NumPy array round trip: NumPy's
	    fixed-width unicode dtype drops trailing NUL characters, so the round
	    trip could silently alter the text.

	    Args:
	        text (str): the string to wrap

	    Returns:
	        list: `[text]`
	    """
	    return [text]