Spaces:
openfree
/
Running on CPU Upgrade

openfree committed on
Commit
e42012b
·
verified ·
1 Parent(s): 9203871

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -19
app.py CHANGED
@@ -5,6 +5,11 @@ import os
5
  from datetime import datetime, timedelta
6
  from huggingface_hub import InferenceClient
7
 
 
 
 
 
 
8
  MAX_COUNTRY_RESULTS = 100 # 국가별 최대 결과 수
9
  MAX_GLOBAL_RESULTS = 1000 # 전세계 최대 결과 수
10
 
@@ -311,6 +316,9 @@ def serphouse_search(query, country):
311
  return format_results_from_raw(response_data)
312
 
313
 
 
 
 
314
  # Hacker News API 관련 함수들 먼저 추가
315
  def get_hn_item(item_id):
316
  """๊ฐœ๋ณ„ ์•„์ดํ…œ ์ •๋ณด ๊ฐ€์ ธ์˜ค๊ธฐ"""
@@ -351,33 +359,134 @@ def format_hn_time(timestamp):
351
  except:
352
  return "Unknown time"
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  def refresh_hn_stories():
355
- """Hacker News ์Šคํ† ๋ฆฌ ์ƒˆ๋กœ๊ณ ์นจ"""
356
  status_msg = "Hacker News ํฌ์ŠคํŠธ๋ฅผ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘..."
357
-
358
  outputs = [gr.update(value=status_msg, visible=True)]
359
 
360
- # ๋ชจ๋“  ์ปดํฌ๋„ŒํŠธ ์ดˆ๊ธฐํ™”
361
- for _ in hn_article_components:
362
  outputs.extend([
363
  gr.update(visible=False),
364
  gr.update(),
365
  gr.update()
366
  ])
367
 
 
 
368
  # ์ตœ์‹  ์Šคํ† ๋ฆฌ ๊ฐ€์ ธ์˜ค๊ธฐ
369
  stories = get_recent_stories()
370
 
371
- # ๊ฒฐ๊ณผ ์—…๋ฐ์ดํŠธ
372
- outputs = [gr.update(value=f"์ด {len(stories)}๊ฐœ์˜ ํฌ์ŠคํŠธ๋ฅผ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.", visible=True)]
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
  for idx, comp in enumerate(hn_article_components):
375
- if idx < len(stories):
376
- story = stories[idx]
377
  outputs.extend([
378
  gr.update(visible=True),
379
  gr.update(value=f"### [{story.get('title', 'Untitled')}]({story.get('url', '#')})"),
380
- gr.update(value=f"**์ž‘์„ฑ์ž:** {story.get('by', 'unknown')} | **์‹œ๊ฐ„:** {format_hn_time(story.get('time', 0))} | **์ ์ˆ˜:** {story.get('score', 0)} | **๋Œ“๊ธ€:** {len(story.get('kids', []))}๊ฐœ")
 
 
 
 
 
 
381
  ])
382
  else:
383
  outputs.extend([
@@ -386,8 +495,7 @@ def refresh_hn_stories():
386
  gr.update()
387
  ])
388
 
389
- return outputs
390
-
391
 
392
  css = """
393
  footer {visibility: hidden;}
@@ -515,23 +623,24 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css, title="NewsAI ์„œ๋น„์Šค") as
515
  'index': i,
516
  })
517
 
 
518
  # AI ๋ฆฌํฌํ„ฐ ํƒญ
519
  with gr.Tab("AI ๋ฆฌํฌํ„ฐ"):
520
- gr.Markdown("์ง€๋‚œ 24์‹œ๊ฐ„ ๋™์•ˆ์˜ Hacker News ํฌ์ŠคํŠธ๋ฅผ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค.")
521
-
522
  with gr.Column():
523
  refresh_button = gr.Button("์ƒˆ๋กœ๊ณ ์นจ", variant="primary")
524
  status_message_hn = gr.Markdown("")
525
 
526
  with gr.Column(elem_id="hn_results_area"):
527
  hn_articles_state = gr.State([])
528
-
529
  hn_article_components = []
530
- for i in range(100):
531
  with gr.Group(visible=False) as article_group:
532
  title = gr.Markdown()
533
  info = gr.Markdown()
534
-
535
  hn_article_components.append({
536
  'group': article_group,
537
  'title': title,
@@ -541,9 +650,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css, title="NewsAI ์„œ๋น„์Šค") as
541
 
542
 
543
 
544
-
545
-
546
-
547
 
548
  # ๊ธฐ์กด ํ•จ์ˆ˜๋“ค
549
  def search_and_display(query, country, articles_state, progress=gr.Progress()):
 
5
  from datetime import datetime, timedelta
6
  from huggingface_hub import InferenceClient
7
 
8
+ from bs4 import BeautifulSoup
9
+ import concurrent.futures
10
+ import time
11
+ import re
12
+
13
  MAX_COUNTRY_RESULTS = 100 # 국가별 최대 결과 수
14
  MAX_GLOBAL_RESULTS = 1000 # 전세계 최대 결과 수
15
 
 
316
  return format_results_from_raw(response_data)
317
 
318
 
319
+
320
+
321
+
322
  # Hacker News API 관련 함수들 먼저 추가
323
  def get_hn_item(item_id):
324
  """๊ฐœ๋ณ„ ์•„์ดํ…œ ์ •๋ณด ๊ฐ€์ ธ์˜ค๊ธฐ"""
 
359
  except:
360
  return "Unknown time"
361
 
362
+
363
+
364
+
365
+
366
+
367
+
368
def clean_text(text):
    """Strip HTML tags from *text* and normalize its whitespace.

    Args:
        text: Raw text that may contain HTML markup. ``None`` or an empty
            string yields an empty string.

    Returns:
        The text with every ``<...>`` tag removed, each run of whitespace
        collapsed to a single space, and leading/trailing whitespace
        stripped.
    """
    if not text:
        return ''
    # Remove tags before collapsing whitespace: doing it the other way
    # round leaves a double space wherever a tag sat between two spaces.
    without_tags = re.sub(r'<[^>]+>', '', text)
    return re.sub(r'\s+', ' ', without_tags).strip()
373
+
374
def get_article_content(url):
    """Download *url* and return the text of its ``<p>`` elements.

    URLs that are empty or contain ``github.com`` or ``twitter.com`` are
    skipped without making a request.

    Args:
        url: Address of the article to scrape.

    Returns:
        Up to 4000 characters of cleaned paragraph text, or ``None`` when
        the URL is skipped, the request fails, the server answers with an
        error status, or the page contains no paragraph text.
    """
    if not url or 'github.com' in url or 'twitter.com' in url:
        return None

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        # Without this, 4xx/5xx error pages would be scraped and later
        # summarized as if they were the article itself.
        response.raise_for_status()

        # Pass raw bytes so BeautifulSoup can detect the charset from the
        # document; response.text falls back to ISO-8859-1 when the header
        # carries no charset, which garbles non-Latin pages.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that never hold article text.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()

        text = clean_text(' '.join(p.get_text() for p in soup.find_all('p')))

        # Cap the length of the returned text; an empty scrape is
        # reported as None like every other failure.
        return text[:4000] or None
    except Exception as e:
        # Best-effort scraping: log the failure and let the caller skip
        # this story.
        print(f"Scraping error for {url}: {str(e)}")
        return None
396
+
397
def generate_summary(text):
    """Ask the text-generation model for a 2-3 sentence summary of *text*.

    The request goes through ``hf_client``, which is defined elsewhere in
    the file.

    Args:
        text: Article text to summarize.

    Returns:
        The generated summary with surrounding whitespace removed, or
        ``None`` when *text* is empty or blank, the model call fails, or
        the model returns only whitespace.
    """
    # Blank input cannot produce a meaningful summary; skip the model call.
    if not text or not text.strip():
        return None

    prompt = """Please analyze and summarize the following text in 2-3 sentences.
    Focus on the main points and key information:

    Text: {text}

    Summary:"""

    try:
        response = hf_client.text_generation(
            prompt.format(text=text),
            max_new_tokens=150,
            temperature=0.7,
            repetition_penalty=1.2
        )
    except Exception as e:
        # Best-effort: a failed generation just means no summary.
        print(f"Summary generation error: {str(e)}")
        return None

    if isinstance(response, str):
        # A whitespace-only completion is not a usable summary.
        return response.strip() or None
    return response
420
+
421
def process_hn_story(story, progress=None):
    """Scrape, summarize and translate a single Hacker News story.

    Args:
        story: Story dict; its ``'url'`` entry is the article to process.
        progress: Unused; kept so existing callers keep working.

    Returns:
        A ``(story, summary)`` tuple. ``summary`` is ``None`` when the
        story has no URL, the article could not be scraped, or no summary
        could be generated. When translation fails or returns nothing,
        the untranslated summary is returned instead.
    """
    try:
        url = story.get('url')
        content = get_article_content(url) if url else None
        summary_en = generate_summary(content) if content else None
        if not summary_en:
            return story, None
    except Exception as e:
        print(f"Story processing error: {str(e)}")
        return story, None

    try:
        summary_ko = translate_to_korean(summary_en)
    except Exception as e:
        print(f"Story processing error: {str(e)}")
        summary_ko = None

    # Keep the already-generated summary rather than dropping the story
    # when the translation step fails or comes back empty.
    return story, summary_ko or summary_en
442
+
443
  def refresh_hn_stories():
444
+ """Hacker News ์Šคํ† ๋ฆฌ ์ƒˆ๋กœ๊ณ ์นจ (๊ฐœ์„ ๋œ ๋ฒ„์ „)"""
445
  status_msg = "Hacker News ํฌ์ŠคํŠธ๋ฅผ ๊ฐ€์ ธ์˜ค๋Š” ์ค‘..."
 
446
  outputs = [gr.update(value=status_msg, visible=True)]
447
 
448
+ # ์ปดํฌ๋„ŒํŠธ ์ดˆ๊ธฐํ™”
449
+ for comp in hn_article_components:
450
  outputs.extend([
451
  gr.update(visible=False),
452
  gr.update(),
453
  gr.update()
454
  ])
455
 
456
+ yield outputs
457
+
458
  # ์ตœ์‹  ์Šคํ† ๋ฆฌ ๊ฐ€์ ธ์˜ค๊ธฐ
459
  stories = get_recent_stories()
460
 
461
+ # ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ๋กœ ์š”์•ฝ ์ƒ์„ฑ
462
+ processed_stories = []
463
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
464
+ future_to_story = {executor.submit(process_hn_story, story): story
465
+ for story in stories[:20]} # ์ƒ์œ„ 20๊ฐœ๋งŒ ์ฒ˜๋ฆฌ
466
+
467
+ for future in concurrent.futures.as_completed(future_to_story):
468
+ story, summary = future.result()
469
+ if summary:
470
+ processed_stories.append((story, summary))
471
+
472
+ # ๊ฒฐ๊ณผ ์ •๋ ฌ ๋ฐ ์ถœ๋ ฅ
473
+ processed_stories.sort(key=lambda x: x[0].get('time', 0), reverse=True)
474
+
475
+ outputs = [gr.update(value=f"์ด {len(processed_stories)}๊ฐœ์˜ ํฌ์ŠคํŠธ๊ฐ€ ์ฒ˜๋ฆฌ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", visible=True)]
476
 
477
  for idx, comp in enumerate(hn_article_components):
478
+ if idx < len(processed_stories):
479
+ story, summary = processed_stories[idx]
480
  outputs.extend([
481
  gr.update(visible=True),
482
  gr.update(value=f"### [{story.get('title', 'Untitled')}]({story.get('url', '#')})"),
483
+ gr.update(value=f"""
484
+ **์ž‘์„ฑ์ž:** {story.get('by', 'unknown')} |
485
+ **์‹œ๊ฐ„:** {format_hn_time(story.get('time', 0))} |
486
+ **์ ์ˆ˜:** {story.get('score', 0)} |
487
+ **๋Œ“๊ธ€:** {len(story.get('kids', []))}๊ฐœ\n
488
+ **AI ์š”์•ฝ:** {summary}
489
+ """)
490
  ])
491
  else:
492
  outputs.extend([
 
495
  gr.update()
496
  ])
497
 
498
+ yield outputs
 
499
 
500
  css = """
501
  footer {visibility: hidden;}
 
623
  'index': i,
624
  })
625
 
626
+
627
  # AI ๋ฆฌํฌํ„ฐ ํƒญ
628
  with gr.Tab("AI ๋ฆฌํฌํ„ฐ"):
629
+ gr.Markdown("์ง€๋‚œ 24์‹œ๊ฐ„ ๋™์•ˆ์˜ Hacker News ํฌ์ŠคํŠธ๋ฅผ AI๊ฐ€ ์š”์•ฝํ•˜์—ฌ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค.")
630
+
631
  with gr.Column():
632
  refresh_button = gr.Button("์ƒˆ๋กœ๊ณ ์นจ", variant="primary")
633
  status_message_hn = gr.Markdown("")
634
 
635
  with gr.Column(elem_id="hn_results_area"):
636
  hn_articles_state = gr.State([])
637
+
638
  hn_article_components = []
639
+ for i in range(100): # ์ƒ์œ„ 20๊ฐœ ํฌ์ŠคํŠธ๋งŒ ์ฒ˜๋ฆฌ
640
  with gr.Group(visible=False) as article_group:
641
  title = gr.Markdown()
642
  info = gr.Markdown()
643
+
644
  hn_article_components.append({
645
  'group': article_group,
646
  'title': title,
 
650
 
651
 
652
 
 
 
 
653
 
654
  # ๊ธฐ์กด ํ•จ์ˆ˜๋“ค
655
  def search_and_display(query, country, articles_state, progress=gr.Progress()):