Spaces:

mamogasr
/

llm_engineering

Running

App Files Files Community

llm_engineering / week6 /items.py

mamogasr

Upload folder using huggingface_hub

5fdb69e verified 7 months ago

raw

history blame contribute delete

3.66 kB

	from typing import Optional
	from transformers import AutoTokenizer
	import re

	# Hugging Face model id; Item loads this model's tokenizer to count and truncate tokens.
	BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

	# Token window for the product text.
	# Any less than MIN_TOKENS and we don't have enough useful content.
	MIN_TOKENS = 150
	# Truncate after MAX_TOKENS; after adding the prompt text this gets to around 180 tokens.
	MAX_TOKENS = 160

	# Character limits applied to the raw contents before tokenizing:
	# contents must be longer than MIN_CHARS and are cut at CEILING_CHARS.
	MIN_CHARS = 300
	CEILING_CHARS = 7 * MAX_TOKENS

	class Item:
	    """
	    An Item is a cleaned, curated datapoint of a Product with a Price.

	    The raw datapoint is parsed as soon as the Item is constructed. Only when
	    the product text is long enough (see ``parse``) is ``include`` set to True
	    and ``prompt`` / ``token_count`` populated; otherwise ``prompt`` stays None.
	    """

	    # Tokenizer shared by all Items; loaded once, when the class body executes.
	    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
	    # Text that introduces the price at the end of every prompt.
	    PREFIX = "Price is $"
	    QUESTION = "How much does this cost to the nearest dollar?"
	    # Substrings stripped from the details string, applied in this order.
	    REMOVALS = [
	        '"Batteries Included?": "No"',
	        '"Batteries Included?": "Yes"',
	        '"Batteries Required?": "No"',
	        '"Batteries Required?": "Yes"',
	        "By Manufacturer",
	        "Item",
	        "Date First",
	        "Package",
	        ":",
	        "Number of",
	        "Best Sellers",
	        "Number",
	        "Product ",
	    ]

	    title: str
	    price: float
	    category: str  # annotated only; never assigned inside this class
	    token_count: int = 0
	    details: Optional[str]
	    prompt: Optional[str] = None
	    include = False

	    def __init__(self, data, price):
	        """
	        Build an Item from a raw datapoint and its price.

	        ``data`` is indexed with the keys 'title', 'description', 'features'
	        and 'details'; a missing key raises KeyError.
	        """
	        self.title = data['title']
	        self.price = price
	        self.parse(data)

	    def scrub_details(self):
	        """
	        Return the details string with every REMOVALS substring deleted.
	        """
	        cleaned = self.details
	        for unwanted in self.REMOVALS:
	            cleaned = cleaned.replace(unwanted, "")
	        return cleaned

	    def scrub(self, stuff):
	        """
	        Clean up the provided text by removing unnecessary characters and whitespace.

	        Runs of colons, brackets, braces, double quotes and whitespace collapse
	        to a single space. Words of 7+ characters that contain a digit are
	        dropped, as these are likely irrelevant product numbers.
	        """
	        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
	        stuff = stuff.replace(" ,", ",").replace(",,,", ",").replace(",,", ",")
	        kept = [
	            word
	            for word in stuff.split(' ')
	            if len(word) < 7 or not any(char.isdigit() for char in word)
	        ]
	        return " ".join(kept)

	    def parse(self, data):
	        """
	        Parse this datapoint and, if it fits within the allowed token range,
	        build the prompt and set include to True.

	        Always stores ``data['details']`` on ``self.details``. Returns None.
	        """
	        # Assemble description, features and scrubbed details, one per line.
	        contents = '\n'.join(data['description'])
	        if contents:
	            contents += '\n'
	        features = '\n'.join(data['features'])
	        if features:
	            contents += features + '\n'
	        self.details = data['details']
	        if self.details:
	            contents += self.scrub_details() + '\n'

	        # Too little raw text: leave the item excluded.
	        if len(contents) <= MIN_CHARS:
	            return

	        # Cap the raw text before tokenizing.
	        contents = contents[:CEILING_CHARS]
	        text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
	        tokens = self.tokenizer.encode(text, add_special_tokens=False)

	        # Too few tokens: leave the item excluded.
	        if len(tokens) <= MIN_TOKENS:
	            return

	        # Truncate to the token budget and rebuild the text from the kept tokens.
	        text = self.tokenizer.decode(tokens[:MAX_TOKENS])
	        self.make_prompt(text)
	        self.include = True

	    def make_prompt(self, text):
	        """
	        Set the prompt instance variable to be a prompt appropriate for training,
	        and record its length in tokens in token_count.
	        """
	        # NOTE: round() on a float rounds exact halves to the nearest even integer.
	        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
	        self.prompt += f"{self.PREFIX}{str(round(self.price))}.00"
	        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

	    def test_prompt(self):
	        """
	        Return a prompt suitable for testing, with the actual price removed.

	        The prompt is cut at the LAST occurrence of PREFIX, so product text that
	        happens to contain "Price is $" is kept intact instead of truncating the
	        prompt at that earlier match.

	        Raises AttributeError if no prompt was built (prompt is still None).
	        """
	        return self.prompt.rsplit(self.PREFIX, 1)[0] + self.PREFIX

	    def __repr__(self):
	        """
	        Return a String version of this Item
	        """
	        return f"<{self.title} = ${self.price}>"