import openai
import mysql.connector
from mysql.connector import Error
from datetime import datetime, timedelta
import tiktoken
import argparse
from dotenv import load_dotenv
import os
load_dotenv()
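# Expected .env entries, shown as a sketch; the variable names come from the
# os.getenv calls in this script, and the example values are placeholders:
#   OPENAI_API_KEY=sk-...
#   MYSQL_HOST=localhost
#   MYSQL_DATABASE=news
#   MYSQL_USER=user
#   MYSQL_PASSWORD=password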
openai.api_key = os.getenv('OPENAI_API_KEY')

# Fail fast if the key is missing, instead of printing the secret to stdout.
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is not set")
# Token budgets: BATCH_TOKEN_LIMIT caps each request to the model,
# MAX_TOKENS caps the combined summary before a final consolidation pass.
MAX_TOKENS = 30000
BATCH_TOKEN_LIMIT = 10000
def create_connection():
    """Open a MySQL connection using credentials from the environment."""
    connection = None
    try:
        connection = mysql.connector.connect(
            host=os.getenv('MYSQL_HOST'),
            database=os.getenv('MYSQL_DATABASE'),
            user=os.getenv('MYSQL_USER'),
            password=os.getenv('MYSQL_PASSWORD')
        )
    except Error as e:
        print(f"Error: {e}")
    return connection
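# A minimal sketch of the tables this script assumes. Column names are
# inferred from the queries below; the real schema may differ:
#
#   CREATE TABLE news (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       title TEXT,
#       content TEXT,
#       date DATE
#   );
#   CREATE TABLE summary (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       summary_text LONGTEXT,
#       generated_at DATETIME
#   );
#   CREATE TABLE news_summary_sources (
#       summary_id INT,
#       news_id INT
#   );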
def get_news_by_date(connection, target_date):
    query = "SELECT id, title, content FROM news WHERE date = %s"
    cursor = connection.cursor(dictionary=True)
    cursor.execute(query, (target_date,))
    news_list = cursor.fetchall()
    cursor.close()
    return news_list
def count_tokens(text, model="gpt-4"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))
def summarize_content(content):
    """Send the news text to the model and return its market analysis.

    The prompts are kept in Chinese so the model replies in Chinese;
    English translations are given in the comments.
    """
    print('Summarizing content...')
    response = openai.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                # "You are a financial-markets expert, skilled at analyzing
                # news and forecasting stock-market movements."
                "role": "system",
                "content": "你是一位金融市場專家,擅長根據新聞資料進行分析並預測股市走勢。"
            },
            {
                # "Read the following news summaries, consolidate similar
                # items, give a likely stock-price direction for each group
                # of similar items, and answer group by group."
                "role": "user",
                "content": f"請閱讀以下新聞摘要,根據相似的內容進行統整,並針對相似的內容給予股價預測可能走勢,最後分組回答:\n\n{content}"
            }
        ]
    )

    summary_text = response.choices[0].message.content.strip()
    return summary_text
def summarize_in_batches(news_content):
    """Summarize news_content, batching it when it exceeds the per-call token limit."""
    tokens = count_tokens(news_content)
    if tokens <= BATCH_TOKEN_LIMIT:
        return summarize_content(news_content)

    paragraphs = news_content.split('\n\n')
    batch = []
    batch_token_count = 0
    batch_summaries = []

    for paragraph in paragraphs:
        paragraph_tokens = count_tokens(paragraph)

        # Flush the current batch before it would exceed the per-call limit.
        # The `batch` check avoids summarizing an empty batch when a single
        # paragraph is itself larger than the limit.
        if batch and batch_token_count + paragraph_tokens > BATCH_TOKEN_LIMIT:
            batch_summaries.append(summarize_content("\n\n".join(batch)))
            batch = []
            batch_token_count = 0

        batch.append(paragraph)
        batch_token_count += paragraph_tokens

    if batch:
        batch_summaries.append(summarize_content("\n\n".join(batch)))

    combined_summary = "\n\n".join(batch_summaries)

    # If the concatenated batch summaries are still too long, run one more
    # consolidation pass over them.
    if count_tokens(combined_summary) > MAX_TOKENS:
        return summarize_content(combined_summary)

    return combined_summary
def insert_summary(connection, summary_text, target_date):
    # The summary is timestamped with the news date, not the insertion time.
    generated_at = target_date
    insert_summary_query = "INSERT INTO summary (summary_text, generated_at) VALUES (%s, %s)"
    cursor = connection.cursor()
    cursor.execute(insert_summary_query, (summary_text, generated_at))
    summary_id = cursor.lastrowid
    connection.commit()
    cursor.close()
    return summary_id
def insert_news_summary_sources(connection, summary_id, news_ids):
    insert_relation_query = "INSERT INTO news_summary_sources (summary_id, news_id) VALUES (%s, %s)"
    cursor = connection.cursor()
    for news_id in news_ids:
        cursor.execute(insert_relation_query, (summary_id, news_id))
    connection.commit()
    cursor.close()
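# Note: the row-by-row loop above could be collapsed into one round trip with
# cursor.executemany(insert_relation_query, [(summary_id, nid) for nid in news_ids]);
# the explicit loop is kept to match the original behavior.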
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Choose the news date to process")
    parser.add_argument('--date', type=str, choices=['today', 'yesterday'], default='yesterday',
                        help="Process today's or yesterday's news (today, yesterday)")

    args = parser.parse_args()

    if args.date == 'today':
        target_date = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    else:
        target_date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d 00:00:00')

    connection = create_connection()

    # Guard against create_connection() returning None on failure.
    if connection and connection.is_connected():
        # Only the date part is used for the lookup; the full timestamp is
        # what gets stored as the summary's generated_at value.
        news = get_news_by_date(connection, target_date.split(" ")[0])

        if news:
            # Field labels ("news ID", "title", "content") stay in Chinese to
            # match the Chinese prompt sent to the model.
            news_content = "\n\n".join(
                [f"新聞 ID: {n['id']}\n標題: {n['title']}\n內容: {n['content']}" for n in news]
            )

            final_summary = summarize_in_batches(news_content)

            if final_summary:
                summary_id = insert_summary(connection, final_summary, target_date)

                news_ids = [news_item['id'] for news_item in news]

                insert_news_summary_sources(connection, summary_id, news_ids)

                print(f"The news summary for {target_date.split(' ')[0]} was consolidated and inserted into the database.")
            else:
                print("Unable to generate a consolidated news summary.")
        else:
            print(f"No news data for {target_date.split(' ')[0]}.")
        connection.close()
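# Example invocation (the file name is a placeholder):
#   python summarize_news.py --date today
#   python summarize_news.py                # defaults to yesterday's news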