mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 02:20:20 +08:00
add batch process
This commit is contained in:
parent
b83ca2369a
commit
3e4454a33b
@@ -81,6 +81,10 @@ If the webpage text does not contain any information related to points of intere
|
||||
async def get_author_and_publish_date(self, text: str) -> tuple[str, str]:
|
||||
if not text:
|
||||
return "NA", "NA"
|
||||
|
||||
if len(text) > 1024:
|
||||
text = f'{text[:500]}......{text[-500:]}'
|
||||
|
||||
system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
|
||||
suffix = '''Please output the extracted information in the following JSON format:
|
||||
{"source": source or article author (use "NA" if this information cannot be extracted), "publish_date": extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)}'''
|
||||
@@ -106,14 +110,26 @@ If the webpage text does not contain any information related to points of intere
|
||||
async def get_more_related_urls(self, link_dict: dict) -> set[str]:
|
||||
if not link_dict:
|
||||
return set()
|
||||
|
||||
urls = set()
|
||||
content = ''
|
||||
for key, value in link_dict.items():
|
||||
content = f"{content}{key}: {value}\n"
|
||||
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt}, {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
|
||||
model=self.secondary_model, temperature=0.1)
|
||||
if len(content) > 512:
|
||||
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
|
||||
{'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
|
||||
model=self.secondary_model, temperature=0.1)
|
||||
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
|
||||
urls.update(extract_urls(result))
|
||||
content = ''
|
||||
|
||||
if content:
|
||||
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
|
||||
{'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
|
||||
model=self.secondary_model, temperature=0.1)
|
||||
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
|
||||
urls.update(extract_urls(result))
|
||||
|
||||
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
|
||||
urls = extract_urls(result)
|
||||
raw_urls = list(link_dict.values())
|
||||
for url in urls:
|
||||
if url not in raw_urls:
|
||||
@@ -121,9 +137,10 @@ If the webpage text does not contain any information related to points of intere
|
||||
urls.remove(url)
|
||||
return urls
|
||||
|
||||
async def get_info(self, text: str, info_pre_fix: str) -> list[dict]:
|
||||
async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]:
|
||||
if not text:
|
||||
return []
|
||||
|
||||
content = f'<text>\n{text}\n</text>\n\n{self.get_info_suffix}'
|
||||
result = await llm([{'role': 'system', 'content': self.get_info_prompt}, {'role': 'user', 'content': content}],
|
||||
model=self.model, temperature=0.1, response_format={"type": "json_object"})
|
||||
@@ -152,6 +169,10 @@ If the webpage text does not contain any information related to points of intere
|
||||
if item['focus'] not in self.focus_dict:
|
||||
self.logger.warning(f"{item['focus']} not in focus_list, it's model's Hallucination")
|
||||
continue
|
||||
if item['content'] in link_dict:
|
||||
self.logger.debug(f"{item['content']} in link_dict, aborting")
|
||||
continue
|
||||
|
||||
judge = await llm([{'role': 'system', 'content': system},
|
||||
{'role': 'user', 'content': f'<info>\n{item["content"]}\n</info>\n\n<text>\n{text}\n</text>\n\n{suffix}'}],
|
||||
model=self.secondary_model, temperature=0.1)
|
||||
@@ -189,7 +210,17 @@ If the webpage text does not contain any information related to points of intere
|
||||
publish_date = datetime.now().strftime('%Y-%m-%d')
|
||||
|
||||
related_urls = await self.get_more_related_urls(link_dict)
|
||||
|
||||
info_prefix = f"//{author} {publish_date}//"
|
||||
infos = await self.get_info(text, info_prefix)
|
||||
lines = text.split('\n')
|
||||
text = ''
|
||||
infos = []
|
||||
for line in lines:
|
||||
text = f'{text}{line}'
|
||||
if len(text) > 2048:
|
||||
infos.extend(await self.get_info(text, info_prefix, link_dict))
|
||||
text = ''
|
||||
if text:
|
||||
infos.extend(await self.get_info(text, info_prefix, link_dict))
|
||||
|
||||
return infos, related_urls, author, publish_date
|
||||
|
Loading…
Reference in New Issue
Block a user