Repository: https://github.com/TeamWiseFlow/wiseflow.git
Commit 3e4454a33b (parent b83ca2369a): add batch process
@@ -81,6 +81,10 @@ If the webpage text does not contain any information related to points of intere
     async def get_author_and_publish_date(self, text: str) -> tuple[str, str]:
         if not text:
             return "NA", "NA"
+
+        if len(text) > 1024:
+            text = f'{text[:500]}......{text[-500:]}'
+
         system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
         suffix = '''Please output the extracted information in the following JSON format:
{"source": source or article author (use "NA" if this information cannot be extracted), "publish_date": extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)}'''
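The hunk above caps the input handed to the author/publish-date prompt: pages longer than 1024 characters are cut down to their first and last 500 characters, on the assumption that bylines and dates sit near the top or bottom of a page rather than in the middle. A minimal standalone sketch of the same truncation (the limit, keep size, and '......' marker mirror the diff; the function name is ours):

def clip_for_metadata(text: str, limit: int = 1024, keep: int = 500) -> str:
    # Keep only the head and tail of a long page; author and publish date
    # are usually found near the start or end, not in the body.
    if len(text) > limit:
        return f'{text[:keep]}......{text[-keep:]}'
    return text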
@@ -106,14 +110,26 @@ If the webpage text does not contain any information related to points of intere
     async def get_more_related_urls(self, link_dict: dict) -> set[str]:
         if not link_dict:
             return set()
 
+        urls = set()
         content = ''
         for key, value in link_dict.items():
             content = f"{content}{key}: {value}\n"
-        result = await llm([{'role': 'system', 'content': self.get_more_link_prompt}, {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
-                           model=self.secondary_model, temperature=0.1)
-
-        self.logger.debug(f'get_more_related_urls llm output:\n{result}')
-        urls = extract_urls(result)
+            if len(content) > 512:
+                result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
+                                    {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
+                                   model=self.secondary_model, temperature=0.1)
+                self.logger.debug(f'get_more_related_urls llm output:\n{result}')
+                urls.update(extract_urls(result))
+                content = ''
+
+        if content:
+            result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
+                                {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
+                               model=self.secondary_model, temperature=0.1)
+            self.logger.debug(f'get_more_related_urls llm output:\n{result}')
+            urls.update(extract_urls(result))
 
         raw_urls = list(link_dict.values())
         for url in urls:
             if url not in raw_urls:
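get_more_related_urls now batches the link map instead of sending it in one shot: it accumulates "key: value" lines, flushes a prompt to the secondary model once the buffer passes 512 characters, flushes once more for any remainder, and merges every batch's URLs into a single set. Keeping the flush inside the loop bounds each prompt's size no matter how many links a page has. A self-contained sketch of that accumulate-and-flush pattern, with a hypothetical async process_batch callback standing in for the llm call plus extract_urls:

import asyncio
from typing import Awaitable, Callable

async def batch_items(items: dict[str, str],
                      process_batch: Callable[[str], Awaitable[set[str]]],
                      limit: int = 512) -> set[str]:
    # Accumulate "key: value" lines until the buffer exceeds `limit`,
    # process that batch, then flush whatever is left over at the end.
    results: set[str] = set()
    content = ''
    for key, value in items.items():
        content = f"{content}{key}: {value}\n"
        if len(content) > limit:
            results.update(await process_batch(content))
            content = ''
    if content:
        results.update(await process_batch(content))
    return results

async def demo() -> None:
    async def fake_llm(batch: str) -> set[str]:
        # Placeholder for the real llm(...) call and URL extraction.
        return {line.split(': ', 1)[1] for line in batch.splitlines() if ': ' in line}

    links = {f'link {i}': f'https://example.com/{i}' for i in range(40)}
    print(len(await batch_items(links, fake_llm)))  # -> 40

asyncio.run(demo())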
@@ -121,9 +137,10 @@ If the webpage text does not contain any information related to points of intere
                 urls.remove(url)
         return urls
 
-    async def get_info(self, text: str, info_pre_fix: str) -> list[dict]:
+    async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]:
         if not text:
             return []
 
         content = f'<text>\n{text}\n</text>\n\n{self.get_info_suffix}'
         result = await llm([{'role': 'system', 'content': self.get_info_prompt}, {'role': 'user', 'content': content}],
                            model=self.model, temperature=0.1, response_format={"type": "json_object"})
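This hunk only threads the new link_dict parameter into get_info; it is used to filter the extracted items in the next hunk. The llm call itself is unchanged and still requests response_format={"type": "json_object"}. The diff does not show how that reply is parsed; a common defensive pattern (ours, not necessarily the project's) is to strip optional fences and fall back to an empty object:

import json

def parse_json_reply(raw: str) -> dict:
    # Remove a possible ```json fence and tolerate malformed output, since
    # even json_object mode can occasionally return something unparsable.
    cleaned = raw.strip().removeprefix('```json').removesuffix('```').strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return {}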
@@ -152,6 +169,10 @@ If the webpage text does not contain any information related to points of intere
             if item['focus'] not in self.focus_dict:
                 self.logger.warning(f"{item['focus']} not in focus_list, it's model's Hallucination")
                 continue
+            if item['content'] in link_dict:
+                self.logger.debug(f"{item['content']} in link_dict, aborting")
+                continue
+
             judge = await llm([{'role': 'system', 'content': system},
                                {'role': 'user', 'content': f'<info>\n{item["content"]}\n</info>\n\n<text>\n{text}\n</text>\n\n{suffix}'}],
                               model=self.secondary_model, temperature=0.1)
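The added check drops any extracted item whose content is already a key in link_dict, i.e. the model has echoed one of the captured links back instead of producing real information. The same filter as a free function, a minimal sketch (the helper name is ours):

def drop_link_echoes(items: list[dict], link_dict: dict) -> list[dict]:
    # Discard "infos" whose content is just one of the entries we already
    # passed in; they add nothing beyond the link list itself.
    return [item for item in items if item.get('content') not in link_dict]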
@@ -189,7 +210,17 @@ If the webpage text does not contain any information related to points of intere
             publish_date = datetime.now().strftime('%Y-%m-%d')
 
         related_urls = await self.get_more_related_urls(link_dict)
 
         info_prefix = f"//{author} {publish_date}//"
-        infos = await self.get_info(text, info_prefix)
+        lines = text.split('\n')
+        text = ''
+        infos = []
+        for line in lines:
+            text = f'{text}{line}'
+            if len(text) > 2048:
+                infos.extend(await self.get_info(text, info_prefix, link_dict))
+                text = ''
+        if text:
+            infos.extend(await self.get_info(text, info_prefix, link_dict))
+
        return infos, related_urls, author, publish_date
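Instead of one get_info call over the whole page, the caller now splits the text on newlines, accumulates lines until roughly 2048 characters, runs get_info on each chunk, and concatenates the results, with a final call for whatever is left in the buffer. A small sketch of that newline-bounded chunking; the helper is ours and, unlike the committed loop, it re-appends the '\n' that split() strips so chunk boundaries keep their line breaks:

from typing import Iterator

def chunk_by_lines(text: str, limit: int = 2048) -> Iterator[str]:
    # Yield newline-bounded chunks of roughly `limit` characters; a chunk is
    # emitted as soon as the buffer grows past the limit, and the remainder
    # is yielded at the end.
    buf = ''
    for line in text.split('\n'):
        buf = f'{buf}{line}\n'
        if len(buf) > limit:
            yield buf
            buf = ''
    if buf:
        yield buf

Used the same way the hunk uses its inline loop, awaiting get_info per chunk:

async def collect_infos(text, get_info, info_prefix, link_dict) -> list[dict]:
    infos = []
    for chunk in chunk_by_lines(text):
        infos.extend(await get_info(chunk, info_prefix, link_dict))
    return infos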