Upload

2025-05-15 07:23:45 +00:00 · 2025-05-15 07:23:45 +00:00 · db703d6ed6
commit db703d6ed6
parent c3e8f84e3c
1 changed file with 207 additions and 0 deletions
--- a/max/max-scraper.py
+++ b/max/max-scraper.py
@ -0,0 +1,207 @@
+import os
+import time
+import random
+import string
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+
+# Text file (relative to the working directory) holding one already-saved
+# movie URL per line; read by load_seen_links and appended to by
+# save_seen_links.
+SEEN_LINKS_FILE = "seen_links.txt"
+
+# Genre landing pages visited, in this order, by run_genre_mode.
+GENRE_URLS = [
+    "https://play.max.com/genre/action",
+    "https://play.max.com/genre/comedy",
+    "https://play.max.com/genre/drama",
+    "https://play.max.com/genre/horror",
+    "https://play.max.com/genre/sci-fi",
+    "https://play.max.com/genre/documentary"
+]
+
+def setup_browser():
+    """Start a Chrome session configured for the scraper.
+
+    The chromedriver binary path comes from
+    ``ChromeDriverManager().install()``. No headless flag is passed.
+
+    Returns:
+        The ``webdriver.Chrome`` instance created with these options.
+    """
+    chrome_flags = (
+        "--disable-gpu",
+        "--window-size=1920,1080",
+        # Turns off Blink's "AutomationControlled" feature.
+        "--disable-blink-features=AutomationControlled",
+    )
+    chrome_options = Options()
+    for flag in chrome_flags:
+        chrome_options.add_argument(flag)
+    driver_service = Service(ChromeDriverManager().install())
+    return webdriver.Chrome(service=driver_service, options=chrome_options)
+
+def scroll_to_bottom(driver, max_wait=60, pause=2.5):
+    """Scroll the current page down until its height stops growing.
+
+    After each scroll the function sleeps for ``pause`` seconds and compares
+    ``document.body.scrollHeight`` with the previous value. It stops when the
+    height is unchanged, or when more than ``max_wait`` seconds have elapsed
+    since the first scroll.
+
+    Args:
+        driver: Object exposing ``execute_script(script)`` (a Selenium
+            WebDriver in this script).
+        max_wait: Time budget in seconds for the whole scroll loop. It is
+            checked only after a scroll that changed the height, so at least
+            one scroll always happens.
+        pause: Seconds to sleep after each scroll before re-measuring.
+    """
+    print("\U0001f4dc Scrolling to load all movies...")
+    height_script = "return document.body.scrollHeight"
+    last_height = driver.execute_script(height_script)
+    # Monotonic clock: elapsed time is not affected by system clock changes.
+    start_time = time.monotonic()
+
+    while True:
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(pause)
+        new_height = driver.execute_script(height_script)
+        if new_height == last_height:
+            print("\u2705 Reached bottom of page.")
+            break
+        last_height = new_height
+        if time.monotonic() - start_time > max_wait:
+            print("\u23f1\ufe0f Scroll timed out.")
+            break
+
+def extract_movie_links(driver):
+    """Collect the unique movie URLs linked from the current page.
+
+    Args:
+        driver: Selenium WebDriver positioned on the page to scan.
+
+    Returns:
+        A list of distinct ``href`` values containing "/movie/", in the order
+        the matching anchors were returned by the driver.
+    """
+    anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/movie/")]')
+    # Read each href exactly once; a missing value (None) is skipped instead
+    # of raising TypeError in the substring test.
+    hrefs = (anchor.get_attribute("href") for anchor in anchors)
+    # dict.fromkeys removes duplicates while keeping first-seen order.
+    return list(dict.fromkeys(href for href in hrefs if href and "/movie/" in href))
+
+def generate_random_query():
+    """Return a random string of one or two lowercase ASCII letters."""
+    # Draw the length first, then the letters.
+    query_length = random.choice([1, 2])
+    picked = random.choices(string.ascii_lowercase, k=query_length)
+    return "".join(picked)
+
+def load_seen_links():
+    """Load the links recorded in ``SEEN_LINKS_FILE``.
+
+    Returns:
+        A set with one entry per non-blank line of the file, stripped of
+        surrounding whitespace. An empty set if the file does not exist.
+    """
+    # Open directly and handle the miss: checking for existence first leaves
+    # a window in which the file can disappear before it is opened.
+    try:
+        with open(SEEN_LINKS_FILE, "r", encoding="utf-8") as f:
+            stripped = (line.strip() for line in f)
+            return {link for link in stripped if link}
+    except FileNotFoundError:
+        return set()
+
+def save_seen_links(new_links):
+    """Append each link in ``new_links`` to ``SEEN_LINKS_FILE``, one per line."""
+    with open(SEEN_LINKS_FILE, "a") as seen_file:
+        seen_file.writelines(link + "\n" for link in new_links)
+
+def save_links(links, batch_id, prefix, output_dir="output_movies", chunk_size=10):
+    """Write links to numbered text files, ``chunk_size`` links per file.
+
+    Files are named ``{prefix}_{batch_id}_{n}.txt`` with ``n`` starting at 1,
+    and hold one link per line. An existing file with the same name is
+    overwritten.
+
+    Args:
+        links: Iterable of link strings to write.
+        batch_id: Value interpolated into the file name to identify the batch.
+        prefix: File name prefix (the callers in this file pass "movies",
+            "search" or "genre").
+        output_dir: Directory to write into; created if it does not exist.
+        chunk_size: Maximum number of links per file.
+
+    Raises:
+        ValueError: If ``chunk_size`` is smaller than 1.
+    """
+    if chunk_size < 1:
+        raise ValueError("chunk_size must be at least 1")
+    # Materialise once so any iterable (not only a list) can be sliced.
+    links = list(links)
+    os.makedirs(output_dir, exist_ok=True)
+    starts = range(0, len(links), chunk_size)
+    for part, start in enumerate(starts, start=1):
+        chunk = links[start:start + chunk_size]
+        file_path = os.path.join(output_dir, f"{prefix}_{batch_id}_{part}.txt")
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.writelines(link + "\n" for link in chunk)
+        print(f"\U0001f4be Saved {len(chunk)} links to {file_path}")
+
+def run_movies_mode(driver, seen_links, batch_counter):
+    """Save a random sample of movie links from the /movies page.
+
+    Loads the page, scrolls to the bottom, asks the user how many links to
+    sample, then writes the sampled links not already in ``seen_links`` via
+    ``save_links`` and ``save_seen_links``.
+
+    Args:
+        driver: Selenium WebDriver to browse with.
+        seen_links: Set of links saved earlier; updated in place with the
+            links saved by this call so later modes in the same session skip
+            them.
+        batch_counter: Batch identifier used in the output file names.
+    """
+    driver.get("https://play.max.com/movies")
+    # Fixed pause after navigation, presumably to let the page render.
+    time.sleep(4)
+    scroll_to_bottom(driver)
+    all_links = extract_movie_links(driver)
+    print(f"\u2705 Loaded {len(all_links)} movie links.")
+
+    while True:
+        try:
+            count = int(input("\U0001f522 How many random movies do you want to extract? "))
+        except ValueError:
+            print("\u274c Invalid number.")
+            continue
+        if count >= 0:
+            break
+        # random.sample raises ValueError for a negative sample size.
+        print("\u274c Invalid number.")
+
+    sample = random.sample(all_links, min(count, len(all_links)))
+    unique_links = [link for link in sample if link not in seen_links]
+    duplicates = len(sample) - len(unique_links)
+
+    if unique_links:
+        save_links(unique_links, batch_counter, "movies")
+        save_seen_links(unique_links)
+        # Keep the in-memory set in step with the file on disk.
+        seen_links.update(unique_links)
+        print(f"\u2705 Saved {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
+    else:
+        print("\u26a0\ufe0f All selected links were duplicates.")
+
+def run_search_mode(driver, seen_links, search_counter):
+    """Collect movie links from ten searches using randomly chosen terms.
+
+    Each attempt picks a term from a fixed word list, opens the search page,
+    and saves the result links not already in ``seen_links``.
+
+    Args:
+        driver: Selenium WebDriver to browse with.
+        seen_links: Set of links saved earlier; updated in place after every
+            attempt so overlapping results from later queries are skipped.
+        search_counter: Identifier of this run. Output files are labelled
+            ``search_{search_counter}-{attempt}_{n}.txt`` so attempts never
+            overwrite files written by another run in the same session.
+    """
+    search_terms = [
+        "the", "man", "love", "dark", "moon", "fire", "red", "blue", "night",
+        "girl", "life", "death", "dream", "war", "blood", "star", "light", "king", "queen"
+    ]
+    max_attempts = 10
+    found_total = 0
+
+    for attempt in range(1, max_attempts + 1):
+        query = random.choice(search_terms)
+        url = f"https://play.max.com/search?q={query}"
+        print(f"\n\U0001f50d Searching for query: '{query}'")
+        driver.get(url)
+        # Fixed pause after navigation, presumably to let results render.
+        time.sleep(4)
+
+        links = extract_movie_links(driver)
+        unique_links = [link for link in links if link not in seen_links]
+        duplicates = len(links) - len(unique_links)
+
+        if unique_links:
+            # The caller advances its counter by one per run, so the attempt
+            # number is what keeps file names unique within the run.
+            batch_label = f"{search_counter}-{attempt}"
+            save_links(unique_links, batch_label, "search")
+            save_seen_links(unique_links)
+            # Keep the in-memory set in step with the file on disk.
+            seen_links.update(unique_links)
+            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
+            found_total += len(unique_links)
+        else:
+            print(f"\u26a0\ufe0f No new links found for query '{query}'.")
+
+        # No wait is needed once the final search has finished.
+        if attempt < max_attempts:
+            print("\u23f3 Waiting 10 seconds before next search...")
+            time.sleep(10)
+
+    # One 8-digit escape: the \ud83d\udcca pair yields two lone surrogates in
+    # a Python str, which cannot be encoded as UTF-8 when printed.
+    print(f"\n\U0001f4ca Search complete. Found total {found_total} new movie links in {max_attempts} searches.")
+
+def run_genre_mode(driver, seen_links, genre_batch):
+    """Collect movie links from every page listed in ``GENRE_URLS``.
+
+    Each genre page is loaded and scrolled to the bottom, and the links not
+    already in ``seen_links`` are saved.
+
+    Args:
+        driver: Selenium WebDriver to browse with.
+        seen_links: Set of links saved earlier; updated in place after every
+            genre so a movie listed under several genres is saved once.
+        genre_batch: Identifier of this run. Output files are labelled
+            ``genre_{genre_batch}-{position}_{n}.txt`` so genres never
+            overwrite files written by another run in the same session.
+    """
+    found_total = 0
+    last_position = len(GENRE_URLS)
+    for position, genre_url in enumerate(GENRE_URLS, start=1):
+        print(f"\n\U0001f3ac Loading genre: {genre_url}")
+        driver.get(genre_url)
+        # Fixed pause after navigation, presumably to let the page render.
+        time.sleep(4)
+        scroll_to_bottom(driver)
+        links = extract_movie_links(driver)
+        unique_links = [link for link in links if link not in seen_links]
+        duplicates = len(links) - len(unique_links)
+
+        if unique_links:
+            # The caller advances its counter by one per run, so the genre
+            # position is what keeps file names unique within the run.
+            batch_label = f"{genre_batch}-{position}"
+            save_links(unique_links, batch_label, "genre")
+            save_seen_links(unique_links)
+            # Keep the in-memory set in step with the file on disk.
+            seen_links.update(unique_links)
+            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
+            found_total += len(unique_links)
+        else:
+            print("\u26a0\ufe0f No new unique links in this genre.")
+
+        # Pause between page loads only; nothing follows the last genre.
+        if position < last_position:
+            time.sleep(5)
+
+    print("📊 Genre scan complete. Found total", found_total, "new movie links.")
+
+def main():
+    """Run the interactive scraper: open a browser, wait for login, show the menu.
+
+    The user logs in manually in the opened browser, then picks a mode from
+    the menu as often as wanted. Choosing "Exit" asks for confirmation:
+    answering "yes" ends the session, any other answer returns to the menu
+    with the same browser. The browser is always closed on the way out,
+    including when an exception interrupts the session.
+    """
+    print("🛠️ Max.com Movie Scraper — Created by Mike | DRMLab.io Project")
+    driver = setup_browser()
+
+    try:
+        print("\U0001f310 Opening Max.com homepage...")
+        driver.get("https://play.max.com")
+        time.sleep(3)
+
+        print("\n\U0001f511 Please log in manually in the browser.")
+        input("\u23f3 After you're logged in, press ENTER here to continue...")
+
+        seen_links = load_seen_links()
+        movie_batch = 1
+        search_batch = 1
+        genre_batch = 1
+
+        while True:
+            print("📋 === MAIN MENU ===")
+            print("1️⃣  Extract movies from /movies page")
+            print("2️⃣  Extract random movies via search")
+            print("3️⃣  Extract movies by genre")
+            print("4️⃣  Exit")
+            choice = input("Enter your choice: ").strip()
+
+            if choice == "1":
+                run_movies_mode(driver, seen_links, movie_batch)
+                movie_batch += 1
+            elif choice == "2":
+                run_search_mode(driver, seen_links, search_batch)
+                search_batch += 1
+            elif choice == "3":
+                run_genre_mode(driver, seen_links, genre_batch)
+                genre_batch += 1
+            elif choice == "4":
+                again = input("⏹ Do you want to close the browser? (yes/no): ").strip().lower()
+                if again == "yes":
+                    input("✅ Press ENTER to exit.")
+                    break
+                # Any other answer: go back to the menu and keep this browser
+                # rather than starting a second one.
+            else:
+                print("\u274c Invalid choice.")
+
+    finally:
+        # Cleanup only: prompting here would also run during exception
+        # unwinding and could prevent the browser from being closed.
+        driver.quit()
+
+# Start the interactive scraper only when this file is run as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    main()
+